Branch Coverage

udpipe/udpipe.cpp
Criterion Covered Total %
branch 1707 16906 10.1


line true false branch
93 0 0 return os.write(str.str, str.len);
0 0 return os.write(str.str, str.len);
0 0 return os.write(str.str, str.len);
0 0 return os.write(str.str, str.len);
0 0 return os.write(str.str, str.len);
0 0 return os.write(str.str, str.len);
0 0 return os.write(str.str, str.len);
0 0 return os.write(str.str, str.len);
0 0 return os.write(str.str, str.len);
0 0 return os.write(str.str, str.len);
97 1 3 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 1 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
1 2 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
1 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 3 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 3 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
1 2 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 1 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
1 1 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 1 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 1 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
1 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 1 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
15 3 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 15 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
3 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 3 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 16 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
16 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 16 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
0 0 return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
207 0 0 class multiword_token : public token {
0 0 class multiword_token : public token {
0 0 class multiword_token : public token {
229 0 0 class word : public token {
259 0 0 class sentence {
0 0 class sentence {
0 0 class sentence {
417 0 0 pair_system_gold(const word& system, const word& gold) : system(system), gold(gold) {}
492 0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
16 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
1 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
6 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
1 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
28 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
1 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
1 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
18 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
0 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
7 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
22 0 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
496 30 0 if (chr < CHARS) {
498 2 28 if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8;
499 0 28 if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8;
500 0 28 if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
506 0 0 if (chr < CHARS) {
508 0 0 if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8;
509 0 0 if ((othercase & 0xFF) == othercase_type::UPPER_ONLY) return othercase >> 8;
510 0 0 if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8;
511 0 0 if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
605 54 0 if (((unsigned char)*str) < 0x80) return (unsigned char)*str++;
606 0 0 else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR;
607 0 0 else if (((unsigned char)*str) < 0xE0) {
609 0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
611 0 0 } else if (((unsigned char)*str) < 0xF0) {
613 0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
615 0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
617 0 0 } else if (((unsigned char)*str) < 0xF8) {
619 0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
621 0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
623 0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
629 145 0 if (!len) return 0;
631 122 23 if (((unsigned char)*str) < 0x80) return (unsigned char)*str++;
632 0 23 else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR;
633 23 0 else if (((unsigned char)*str) < 0xE0) {
635 23 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
23 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
23 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
637 0 0 } else if (((unsigned char)*str) < 0xF0) {
639 0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
641 0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
643 0 0 } else if (((unsigned char)*str) < 0xF8) {
645 0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
647 0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
649 0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
0 0 if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
674 0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
36 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
33 3 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 18 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
15 3 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
0 0 iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
706 0 0 iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; }
0 0 iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; }
740 25 5 if (chr < 0x80) str += chr;
741 5 0 else if (chr < 0x800) { str += 0xC0 + (chr >> 6); str += 0x80 + (chr & 0x3F); }
742 0 0 else if (chr < 0x10000) { str += 0xE0 + (chr >> 12); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); }
743 0 0 else if (chr < 0x200000) { str += 0xF0 + (chr >> 18); str += 0x80 + ((chr >> 12) & 0x3F); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); }
750 0 0 for (char32_t chr; (chr = decode(str)); )
757 29 7 while (len)
762 0 0 map(f, str.c_str(), result);
0 0 map(f, str.c_str(), result);
0 0 map(f, str.c_str(), result);
0 0 map(f, str.c_str(), result);
0 0 map(f, str.c_str(), result);
809 0 0 unique_ptr conllu_input(input_format::new_conllu_input_format());
810 0 0 if (!conllu_input) return error.assign("Cannot allocate CoNLL-U input format instance!"), false;
0 0 if (!conllu_input) return error.assign("Cannot allocate CoNLL-U input format instance!"), false;
812 0 0 vector plain_text_paragraphs(1); unsigned space_after_nos = 0;
813 0 0 sentence system, gold;
0 0 sentence system, gold;
817 0 0 while (conllu_input->read_block(is, block)) {
0 0 while (conllu_input->read_block(is, block)) {
818 0 0 conllu_input->set_text(block);
819 0 0 while (conllu_input->next_sentence(gold, error)) {
0 0 while (conllu_input->next_sentence(gold, error)) {
820 0 0 gold_data.add_sentence(gold);
823 0 0 if (tokenizer != NONE) {
824 0 0 if (gold.get_new_doc() || gold.get_new_par()) {
0 0 if (gold.get_new_doc() || gold.get_new_par()) {
0 0 if (gold.get_new_doc() || gold.get_new_par()) {
0 0 if (gold.get_new_doc() || gold.get_new_par()) {
0 0 if (gold.get_new_doc() || gold.get_new_par()) {
825 0 0 plain_text_paragraphs.back().append("\n\n");
826 0 0 plain_text_paragraphs.emplace_back();
829 0 0 for (size_t i = 1, j = 0; i < gold.words.size(); i++) {
830 0 0 const token& tok = j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i) ? (const token&)gold.multiword_tokens[j] : (const token&)gold.words[i];
0 0 const token& tok = j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i) ? (const token&)gold.multiword_tokens[j] : (const token&)gold.words[i];
832 0 0 if (tok.get_space_after())
0 0 if (tok.get_space_after())
833 0 0 plain_text_paragraphs.back().push_back(' ');
836 0 0 if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i))
0 0 if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i))
0 0 if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i))
842 0 0 if (tokenizer == NONE && tagger != NONE) {
843 0 0 system.clear();
844 0 0 for (size_t i = 1; i < gold.words.size(); i++)
847 0 0 if (tagger != NONE) {
848 0 0 if (!m->tag(system, tagger, error))
0 0 if (!m->tag(system, tagger, error))
850 0 0 if (parser != NONE)
851 0 0 if (!m->parse(system, parser, error))
0 0 if (!m->parse(system, parser, error))
854 0 0 system_goldtok_data.add_sentence(system);
858 0 0 if (tokenizer == NONE && tagger == NONE && parser != NONE) {
0 0 if (tokenizer == NONE && tagger == NONE && parser != NONE) {
859 0 0 system.clear();
860 0 0 for (size_t i = 1; i < gold.words.size(); i++) {
867 0 0 if (parser != NONE)
868 0 0 if (!m->parse(system, parser, error))
0 0 if (!m->parse(system, parser, error))
870 0 0 system_goldtok_goldtags_data.add_sentence(system);
873 0 0 if (!error.empty()) return false;
877 0 0 if (tokenizer != NONE) {
878 0 0 unique_ptr t(m->new_tokenizer(tokenizer));
879 0 0 if (!t) return error.assign("Cannot allocate new tokenizer!"), false;
0 0 if (!t) return error.assign("Cannot allocate new tokenizer!"), false;
881 0 0 for (auto&& plain_text : plain_text_paragraphs) {
882 0 0 t->set_text(plain_text);
883 0 0 while (t->next_sentence(system, error)) {
0 0 while (t->next_sentence(system, error)) {
884 0 0 if (tagger != NONE) {
885 0 0 if (!m->tag(system, tagger, error))
0 0 if (!m->tag(system, tagger, error))
888 0 0 if (parser != NONE)
889 0 0 if (!m->parse(system, parser, error))
0 0 if (!m->parse(system, parser, error))
892 0 0 system_plaintext_data.add_sentence(system);
894 0 0 if (!error.empty()) return false;
899 0 0 if (tokenizer != NONE) {
900 0 0 if (system_plaintext_data.chars != gold_data.chars) {
904 0 0 word_alignment::best_alignment(system_plaintext_data, gold_data, plaintext_alignment);
912 0 0 if (multiwords.total_gold || multiwords.total_system)
0 0 if (multiwords.total_gold || multiwords.total_system)
926 0 0 if (tagger != NONE) {
930 0 0 auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; });
0 0 auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; });
0 0 auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; });
938 0 0 if (tagger != NONE && parser != NONE) {
940 0 0 auto las = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
0 0 auto las = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
948 0 0 if (tokenizer == NONE && tagger != NONE) {
950 0 0 if (!word_alignment::perfect_alignment(system_goldtok_data, gold_data, goldtok_alignment))
0 0 if (!word_alignment::perfect_alignment(system_goldtok_data, gold_data, goldtok_alignment))
951 0 0 return error.assign("Internal UDPipe error (the words of the gold data do not match)!"), false;
956 0 0 auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; });
0 0 auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; });
0 0 auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; });
963 0 0 if (parser != NONE) {
965 0 0 auto las = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
0 0 auto las = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
972 0 0 if (tokenizer == NONE && tagger == NONE && parser != NONE) {
0 0 if (tokenizer == NONE && tagger == NONE && parser != NONE) {
974 0 0 if (!word_alignment::perfect_alignment(system_goldtok_goldtags_data, gold_data, goldtok_goldtags_alignment))
0 0 if (!word_alignment::perfect_alignment(system_goldtok_goldtags_data, gold_data, goldtok_goldtags_alignment))
975 0 0 return error.assign("Internal UDPipe error (the words of the goldtok data do not match)!"), false;
978 0 0 auto las = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
0 0 auto las = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
989 0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
990 0 0 if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first))
0 0 if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first))
0 0 if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first))
0 0 if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first))
0 0 if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first))
0 0 if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first))
0 0 if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first))
0 0 if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first))
992 0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first))
998 0 0 gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. };
0 0 gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. };
0 0 gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. };
0 0 gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. };
0 0 gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. };
0 0 gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. };
1006 0 0 this->w.head = w.head ? id + (w.head - w.id) : 0;
1014 0 0 if (colon != string::npos)
1015 0 0 this->w.deprel.erase(colon);
1020 0 0 for (size_t i = 1, j = 0; i < s.words.size(); i++) {
1022 0 0 const string& form = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? s.multiword_tokens[j].form : s.words[i].form;
0 0 const string& form = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? s.multiword_tokens[j].form : s.words[i].form;
1023 0 0 for (auto&& chr : unilib::utf8::decoder(form))
1024 0 0 if (chr != ' ')
1028 0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) {
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) {
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) {
1030 0 0 for (size_t k = i; int(k) <= s.multiword_tokens[j].id_last; k++) {
1045 0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
0 0 for (auto&& match : matched)
1046 0 0 if (equals(match.system, match.gold))
0 0 if (equals(match.system, match.gold))
0 0 if (equals(match.system, match.gold))
0 0 if (equals(match.system, match.gold))
1050 0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
0 0 total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
1056 0 0 if (alignment.total_system != alignment.total_gold) return false;
1060 0 0 for (size_t i = 0; i < system.words.size(); i++) {
1061 0 0 if (system.words[i].w.form != gold.words[i].w.form)
1074 0 0 for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); )
1075 0 0 if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) &&
0 0 if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) &&
0 0 if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) &&
0 0 if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) &&
1076 0 0 (gold.words[gi].start > system.words[si].start || !gold.words[gi].is_multiword)) {
1078 0 0 if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end)
0 0 if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end)
0 0 if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end)
1080 0 0 else if (system.words[si].start <= gold.words[gi].start)
1086 0 0 size_t ss = si, gs = gi, multiword_range_end = system.words[si].is_multiword ? system.words[si].end : gold.words[gi].end;
1089 0 0 while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end :
0 0 while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end :
0 0 while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end :
1090 0 0 system.words[si].end <= multiword_range_end)) ||
0 0 system.words[si].end <= multiword_range_end)) ||
0 0 system.words[si].end <= multiword_range_end)) ||
0 0 system.words[si].end <= multiword_range_end)) ||
1091 0 0 (gi < gold.words.size() && (gold.words[gi].is_multiword ? gold.words[gi].start < multiword_range_end :
0 0 (gi < gold.words.size() && (gold.words[gi].is_multiword ? gold.words[gi].start < multiword_range_end :
1094 0 0 if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) {
0 0 if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) {
0 0 if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) {
0 0 if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) {
1095 0 0 if (system.words[si].is_multiword) multiword_range_end = max(multiword_range_end, system.words[si].end);
1098 0 0 if (gold.words[gi].is_multiword) multiword_range_end = max(multiword_range_end, gold.words[gi].end);
1105 0 0 for (unsigned s = si - ss; s--; ) {
1106 0 0 lcs[s].resize(gi - gs);
1107 0 0 for (unsigned g = gi - gs; g--; ) {
1108 0 0 lcs[s][g] = max(lcs[s][g], s+1 < lcs.size() ? lcs[s+1][g] : 0);
1109 0 0 lcs[s][g] = max(lcs[s][g], g+1 < lcs[s].size() ? lcs[s][g+1] : 0);
1110 0 0 if (system.words[ss + s].w.form == gold.words[gs + g].w.form)
1111 0 0 lcs[s][g] = max(lcs[s][g], 1 + (s+1 < lcs.size() && g+1 < lcs[s].size() ? lcs[s+1][g+1] : 0));
0 0 lcs[s][g] = max(lcs[s][g], 1 + (s+1 < lcs.size() && g+1 < lcs[s].size() ? lcs[s+1][g+1] : 0));
1115 0 0 for (unsigned s = 0, g = 0; s < si - ss && g < gi - gs; ) {
0 0 for (unsigned s = 0, g = 0; s < si - ss && g < gi - gs; ) {
1116 0 0 if (system.words[ss + s].w.form == gold.words[gs + g].w.form)
1117 0 0 alignment.matched.emplace_back(system.words[ss + s++].w, gold.words[gs + g++].w);
1118 0 0 else if (lcs[s][g] == (s+1 < lcs.size() ? lcs[s+1][g] : 0))
0 0 else if (lcs[s][g] == (s+1 < lcs.size() ? lcs[s+1][g] : 0))
1127 0 0 for (auto&& match : alignment.matched)
1129 0 0 for (auto&& match : alignment.matched)
1130 0 0 if (match.system.head > 0)
1395 0 0 class node {
1430 0 0 class tree {
0 0 class tree {
0 0 class tree {
0 0 class tree {
0 0 class tree {
1499 0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
0 0 explicit binary_decoder_error(const char* description) : runtime_error(description) {}
1502 0 0 class binary_decoder {
0 0 class binary_decoder {
0 0 class binary_decoder {
0 0 class binary_decoder {
0 0 class binary_decoder {
0 0 class binary_decoder {
0 0 class binary_decoder {
0 0 class binary_decoder {
0 0 class binary_decoder {
1527 6 0 buffer.resize(len);
1535 0 1308 if (data + 1 > data_end) throw binary_decoder_error("No more data in binary_decoder");
1540 0 26 if (data + sizeof(uint16_t) > data_end) throw binary_decoder_error("No more data in binary_decoder");
1548 0 1573 if (data + sizeof(uint32_t) > data_end) throw binary_decoder_error("No more data in binary_decoder");
1557 1 35 if (len == 255) len = next_4B();
1562 0 603 if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder");
0 484 if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder");
0 185 if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder");
1577 0 1 if (pos > buffer.size()) throw binary_decoder_error("Cannot seek past end of binary_decoder");
1683 34 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
34 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
34 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
34 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
34 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 34 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
1688 34 0 if (str.len && (str.str[0] == '+' || str.str[0] == '-')) {
8 26 if (str.len && (str.str[0] == '+' || str.str[0] == '-')) {
1694 0 34 if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': empty string."), false;
1698 34 34 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
0 34 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
0 34 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
1699 26 8 if (positive) {
1700 0 26 if (value > (numeric_limits::max() - (str.str[0] - '0')) / 10)
1704 0 8 if (value < (numeric_limits::min() + (str.str[0] - '0')) / 10)
1712 0 34 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
1716 0 34 if (str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false;
1725 0 0 if (!parse_int(str, value_name, result, error))
0 0 if (!parse_int(str, value_name, result, error))
1866 0 3 for (size_t start = 0; start < values.size(); ) {
1867 0 0 while (start < values.size() && values[start] == ';') start++;
0 0 while (start < values.size() && values[start] == ';') start++;
0 0 while (start < values.size() && values[start] == ';') start++;
1868 0 0 if (start >= values.size()) break;
1871 0 0 name.assign(values, start, name_end - start);
1874 0 0 if (name_end == string::npos) {
1876 0 0 } else if (values[name_end] == ';') {
1881 0 0 if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) {
0 0 if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) {
0 0 if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) {
0 0 if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) {
1886 0 0 file.assign(values, file_name, semicolon - file_name);
1887 0 0 ifstream is(path_from_utf8(file).c_str());
1888 0 0 if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false;
0 0 if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false;
0 0 if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false;
1891 0 0 for (value.clear(); is.read(buffer, sizeof(buffer)); )
0 0 for (value.clear(); is.read(buffer, sizeof(buffer)); )
1892 0 0 value.append(buffer, sizeof(buffer));
1893 0 0 value.append(buffer, is.gcount());
1896 0 0 } else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) {
0 0 } else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) {
0 0 } else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) {
0 0 } else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) {
1900 0 0 if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false;
0 0 if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false;
0 0 if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false;
1903 0 0 if (!parse_int(string_piece(values.c_str() + data_size_start, data_size_end - data_size_start), "data_size", data_size, error)) return false;
0 0 if (!parse_int(string_piece(values.c_str() + data_size_start, data_size_end - data_size_start), "data_size", data_size, error)) return false;
1906 0 0 if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false;
0 0 if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false;
0 0 if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false;
1907 0 0 if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false;
0 0 if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false;
0 0 if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false;
0 0 if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false;
0 0 if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false;
1909 0 0 value.assign(values, data_start, data_end - data_start);
1914 0 0 value.assign(values, equal_sign + 1, semicolon - equal_sign - 1);
1961 0 0 while (lock.test_and_set(memory_order_acquire)) {}
0 0 while (lock.test_and_set(memory_order_acquire)) {}
0 1 while (lock.test_and_set(memory_order_acquire)) {}
0 1 while (lock.test_and_set(memory_order_acquire)) {}
0 1 while (lock.test_and_set(memory_order_acquire)) {}
0 1 while (lock.test_and_set(memory_order_acquire)) {}
1970 0 0 while (lock.test_and_set(memory_order_acquire)) {}
0 0 while (lock.test_and_set(memory_order_acquire)) {}
0 1 while (lock.test_and_set(memory_order_acquire)) {}
0 1 while (lock.test_and_set(memory_order_acquire)) {}
0 1 while (lock.test_and_set(memory_order_acquire)) {}
0 1 while (lock.test_and_set(memory_order_acquire)) {}
1971 0 0 if (!stack.empty()) {
0 0 if (!stack.empty()) {
0 1 if (!stack.empty()) {
0 1 if (!stack.empty()) {
0 1 if (!stack.empty()) {
0 1 if (!stack.empty()) {
2027 1 0 struct parser_cache {
2090 1 0 ifstream in(path_from_utf8(fname).c_str(), ifstream::in | ifstream::binary);
2091 1 0 if (!in.is_open()) return nullptr;
2092 1 0 return load(in);
2097 1 0 if (!is.get(len)) return nullptr;
2099 1 0 if (!is.read(&name[0], len)) return nullptr;
1 0 if (!is.read(&name[0], len)) return nullptr;
2101 1 0 if (name == "morphodita_parsito") return model_morphodita_parsito::load(is);
1 0 if (name == "morphodita_parsito") return model_morphodita_parsito::load(is);
2245 0 0 for (string line; getline(is, line); ) {
0 0 for (string line; getline(is, line); ) {
2247 0 0 para.push_back('\n');
2249 0 0 if (line.empty()) break;
2252 0 0 if (is.eof() && !para.empty()) is.clear(istream::eofbit);
0 0 if (is.eof() && !para.empty()) is.clear(istream::eofbit);
0 0 if (is.eof() && !para.empty()) is.clear(istream::eofbit);
2294 0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
2299 0 0 if (str.len && (str.str[0] == '+' || str.str[0] == '-')) {
0 0 if (str.len && (str.str[0] == '+' || str.str[0] == '-')) {
2305 0 0 if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': empty string."), false;
2309 0 0 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
0 0 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
0 0 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
2315 0 0 if (str.len && str.str[0] == '.') {
0 0 if (str.len && str.str[0] == '.') {
2319 0 0 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
0 0 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
0 0 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
2327 0 0 if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false;
2330 0 0 if (str.len && (str.str[0] == 'e' || str.str[0] == 'E')) {
0 0 if (str.len && (str.str[0] == 'e' || str.str[0] == 'E')) {
2335 0 0 if (str.len && (str.str[0] == '+' || str.str[0] == '-')) {
0 0 if (str.len && (str.str[0] == '+' || str.str[0] == '-')) {
2340 0 0 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
0 0 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
0 0 while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
2345 0 0 exponent = pow(10., exponent_negative ? -exponent : exponent);
2346 0 0 if (!isfinite(exponent)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent overflow occured."), false;
2347 0 0 if (exponent == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent underflow occured."), false;
2349 0 0 if (value) {
2351 0 0 if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false;
2352 0 0 if (value == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': underflow occured."), false;
2357 0 0 if (negative) value *= -1;
2360 0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
0 0 while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v'))
2364 0 0 if (str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false;
2373 0 0 if (!parse_double(str, value_name, result, error))
0 0 if (!parse_double(str, value_name, result, error))
2400 1 0 if (!tokenizer_factory)
2405 1 0 if (!named_values::parse(options, parsed_options, parse_error))
1 0 if (!named_values::parse(options, parsed_options, parse_error))
2408 1 0 bool normalized_spaces = parsed_options.count("normalized_spaces");
2409 1 0 bool token_ranges = parsed_options.count("ranges");
2411 1 0 const auto* morpho = !taggers.empty() ? taggers[0].tagger->get_morpho() : nullptr;
1 0 const auto* morpho = !taggers.empty() ? taggers[0].tagger->get_morpho() : nullptr;
2412 1 0 unique_ptr result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges));
1 0 unique_ptr result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges));
1 0 unique_ptr result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges));
2415 0 0 if (parsed_options.count("presegmented") && result)
0 1 if (parsed_options.count("presegmented") && result)
2416 0 0 result.reset(input_format::new_presegmented_tokenizer(result.release()));
2419 0 0 if (parsed_options.count("joint_with_parsing") && result) {
0 1 if (parsed_options.count("joint_with_parsing") && result) {
2421 0 0 if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
0 0 if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
0 0 if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
0 0 if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
0 0 if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
0 0 if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
0 0 if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
2425 0 0 if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
0 0 if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
0 0 if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
0 0 if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
0 0 if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
0 0 if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
0 0 if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
2429 0 0 if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
0 0 if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
0 0 if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
0 0 if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
0 0 if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
0 0 if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
0 0 if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
2432 0 0 result.reset(new joint_with_parsing_tokenizer(result.release(), *this, max_sentence_len, change_boundary_logprob, sentence_logprob));
2441 0 1 if (taggers.empty()) return error.assign("No tagger defined for the UDPipe model!"), false;
2442 1 0 if (s.empty()) return true;
2445 1 0 if (!c) c = new tagger_cache();
2450 1 7 for (size_t i = 1; i < s.words.size(); i++)
2454 7 1 for (size_t i = 1; i < s.words.size(); i++) {
2462 1 1 for (auto&& tagger : taggers) {
2463 0 1 if (!tagger.tagger) return error.assign("No tagger defined for the UDPipe model!"), false;
2467 7 1 for (size_t i = 0; i < c->lemmas.size(); i++)
2472 1 0 if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) {
0 1 if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) {
0 0 if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) {
0 1 if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) {
2474 0 0 for (size_t i = 0; i < c->forms_string_pieces.size(); i++) {
2475 0 0 if (morpho->analyze(c->forms_string_pieces[i], morphodita::morpho::GUESSER, c->lemmas) == morphodita::morpho::GUESSER)
2476 0 0 s.words[i + 1].misc.append(s.words[i + 1].misc.empty() ? "" : "|").append("MorphoGuesser=Yes");
2491 0 1 if (!parser) return error.assign("No parser defined for the UDPipe model!"), false;
2492 1 0 if (s.empty()) return true;
2495 1 0 if (!c) c = new parser_cache();
2498 1 0 if (!named_values::parse(options, c->options, error))
2500 0 1 if (c->options.count("beam_search"))
2501 0 0 if (!parse_int(c->options["beam_search"], "beam_search", beam_search, error))
0 0 if (!parse_int(c->options["beam_search"], "beam_search", beam_search, error))
2505 7 1 for (size_t i = 1; i < s.words.size(); i++) {
2517 7 1 for (size_t i = 1; i < s.words.size(); i++)
2526 1 0 if (!is.get(version)) return nullptr;
2527 1 0 if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr;
2532 0 1 if (version >= 2) {
2534 0 0 if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
0 0 if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
0 0 if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
2535 0 0 if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
0 0 if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
0 0 if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
2539 1 0 if (!m) return nullptr;
2542 1 0 if (!is.get(tokenizer)) return nullptr;
1 0 if (!is.get(tokenizer)) return nullptr;
2543 1 0 m->tokenizer_factory.reset(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr);
1 0 m->tokenizer_factory.reset(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr);
2544 1 0 if (tokenizer && !m->tokenizer_factory) return nullptr;
1 0 if (tokenizer && !m->tokenizer_factory) return nullptr;
1 0 if (tokenizer && !m->tokenizer_factory) return nullptr;
2545 1 0 m->splitter.reset(tokenizer ? multiword_splitter::load(is) : nullptr);
1 0 m->splitter.reset(tokenizer ? multiword_splitter::load(is) : nullptr);
2546 1 0 if (tokenizer && !m->splitter) return nullptr;
1 0 if (tokenizer && !m->splitter) return nullptr;
1 0 if (tokenizer && !m->splitter) return nullptr;
2549 1 0 char taggers; if (!is.get(taggers)) return nullptr;
1 0 char taggers; if (!is.get(taggers)) return nullptr;
2550 1 1 for (char i = 0; i < taggers; i++) {
2551 1 0 char lemma; if (!is.get(lemma)) return nullptr;
1 0 char lemma; if (!is.get(lemma)) return nullptr;
2552 1 0 char xpostag; if (!is.get(xpostag)) return nullptr;
1 0 char xpostag; if (!is.get(xpostag)) return nullptr;
2553 1 0 char feats; if (!is.get(feats)) return nullptr;
1 0 char feats; if (!is.get(feats)) return nullptr;
2554 1 0 int model_type = is.peek();
2557 1 0 model_type == morphodita::tagger_ids::CONLLU3);
1 0 model_type == morphodita::tagger_ids::CONLLU3);
2558 1 0 morphodita::tagger* tagger = morphodita::tagger::load(is);
2559 1 0 if (!tagger) return nullptr;
2560 1 0 m->taggers.emplace_back(raw, i == 0, int(lemma), bool(xpostag), bool(feats), tagger);
2564 1 0 if (!is.get(parser)) return nullptr;
1 0 if (!is.get(parser)) return nullptr;
2565 1 0 m->parser.reset(parser ? parsito::parser::load(is) : nullptr);
1 0 m->parser.reset(parser ? parsito::parser::load(is) : nullptr);
2566 1 0 if (parser && !m->parser) return nullptr;
1 0 if (parser && !m->parser) return nullptr;
1 0 if (parser && !m->parser) return nullptr;
2576 0 0 for (string line; getline(is, line); ) {
0 0 for (string line; getline(is, line); ) {
2578 0 0 block.push_back('\n');
2581 0 0 if (is.eof() && !block.empty()) is.clear(istream::eofbit);
0 0 if (is.eof() && !block.empty()) is.clear(istream::eofbit);
0 0 if (is.eof() && !block.empty()) is.clear(istream::eofbit);
2595 0 0 if (make_copy) {
2605 0 0 if (text.len) {
2613 0 0 while (tokenizer->next_sentence(input, error)) {
0 0 while (tokenizer->next_sentence(input, error)) {
2614 0 0 if (input.get_new_par() && !paragraph.empty()) {
0 0 if (input.get_new_par() && !paragraph.empty()) {
0 0 if (input.get_new_par() && !paragraph.empty()) {
0 0 if (input.get_new_par() && !paragraph.empty()) {
2615 0 0 if (!parse_paragraph(paragraph, error)) return false;
0 0 if (!parse_paragraph(paragraph, error)) return false;
2616 0 0 for (auto&& sentence : paragraph)
2617 0 0 sentences.push_back(sentence);
2620 0 0 paragraph.push_back(input);
2622 0 0 if (!error.empty()) return false;
2624 0 0 if (!paragraph.empty()) {
2625 0 0 if (!parse_paragraph(paragraph, error)) return false;
0 0 if (!parse_paragraph(paragraph, error)) return false;
2626 0 0 for (auto&& sentence : paragraph)
2627 0 0 sentences.push_back(sentence);
2633 0 0 if (sentences_index < sentences.size()) {
2643 0 0 vector sentence_boundary(1, true);
2644 0 0 vector token_boundary(1, true);
2646 0 0 for (auto&& s : paragraph) {
2648 0 0 for (unsigned i = 1; i < s.words.size(); i++) {
2649 0 0 all_words.words.push_back(s.words[i]);
2651 0 0 sentence_boundary.push_back(i+1 == s.words.size());
2652 0 0 token_boundary.push_back(true);
2655 0 0 for (auto&& mwt : s.multiword_tokens) {
2656 0 0 all_words.multiword_tokens.push_back(mwt);
2659 0 0 for (int i = all_words.multiword_tokens.back().id_first; i < all_words.multiword_tokens.back().id_last; i++)
2664 0 0 vector best_logprob(all_words.words.size(), -numeric_limits::infinity()); best_logprob[0] = 0.;
2665 0 0 vector best_length(all_words.words.size(), 0);
2666 0 0 sentence s;
2668 0 0 for (unsigned start = 1; start < all_words.words.size(); start++) {
2669 0 0 if (!token_boundary[start - 1]) continue;
2670 0 0 s.clear();
2671 0 0 for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) {
0 0 for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) {
0 0 for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) {
2672 0 0 s.words.push_back(all_words.words[end - 1]);
2674 0 0 if (!token_boundary[end - 1]) continue;
2676 0 0 for (unsigned i = 1; i < s.words.size(); i++) {
2682 0 0 if (!model.parse(s, DEFAULT, error, &cost)) return false;
0 0 if (!model.parse(s, DEFAULT, error, &cost)) return false;
2684 0 0 if (best_logprob[start - 1] + cost > best_logprob[end - 1]) {
2692 0 0 for (unsigned end = all_words.words.size(); end > 1; end -= best_length[end - 1])
2693 0 0 sentence_lengths.push_back(best_length[end - 1]);
2699 0 0 for (unsigned i = 1; i < sentence_lengths.size(); i++) {
2702 0 0 paragraph.emplace_back();
2703 0 0 while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) {
0 0 while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) {
0 0 while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) {
2704 0 0 paragraph.back().multiword_tokens.push_back(all_words.multiword_tokens.front());
2710 0 0 for (unsigned word = sentence_lengths[i - 1]; word < sentence_lengths[i]; word++) {
2711 0 0 paragraph.back().words.push_back(all_words.words[word]);
2718 0 0 if (!paragraph.empty()) {
2719 0 0 if (new_document) {
2720 0 0 paragraph.front().set_new_doc(true, document_id);
2724 0 0 paragraph.front().set_new_par(true);
2732 0 7 if (raw) {
2733 0 0 if (lemma) word.lemma.assign(analysis.lemma);
2734 0 0 if (xpostag) word.xpostag.assign(analysis.tag);
2739 7 0 if (lemma == 1) {
2741 0 0 } else if (lemma == 2) {
2745 0 0 if (analysis.lemma[0] == '~') {
2747 0 0 if (end != string::npos) {
2749 0 0 if (analysis.lemma.compare(end + 1, string::npos, word.lemma) == 0)
2756 0 7 if (version == 2) {
2758 0 0 for (auto && chr : word.lemma)
2759 0 0 if (chr == '\001')
2761 0 7 } else if (version >= 3) {
2763 0 0 for (size_t i = 0; i + 1 < word.lemma.size(); i++)
2764 0 0 if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0))
0 0 if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0))
0 0 if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0))
2768 0 7 if (!upostag && !xpostag && !feats) return;
0 0 if (!upostag && !xpostag && !feats) return;
2773 7 0 if (upostag) word.upostag.assign(analysis.tag, start, end - start);
2775 7 0 if (!xpostag && !feats) return;
2780 7 0 if (xpostag) word.xpostag.assign(analysis.tag, start, end - start);
2782 7 0 if (!feats) return;
2793 14 0 if (version <= 1) return output.assign(form.str, form.len);
2835 0 0 for (auto&& chr : utf8::decoder(form.str, form.len)) {
2837 0 0 if (chr == 0x640 || (chr >= 0x64B && chr <= 0x657) || chr == 0x670) {}
0 0 if (chr == 0x640 || (chr >= 0x64B && chr <= 0x657) || chr == 0x670) {}
2838 0 0 else if (chr == 0x622) utf8::append(output, 0x627);
2839 0 0 else if (chr == 0x623) utf8::append(output, 0x627);
2840 0 0 else if (chr == 0x624) utf8::append(output, 0x648);
2841 0 0 else if (chr == 0x625) utf8::append(output, 0x627);
2842 0 0 else if (chr == 0x626) utf8::append(output, 0x64A);
2843 0 0 else if (chr == 0x671) utf8::append(output, 0x627);
2844 0 0 else if (chr == 0x6A9) utf8::append(output, 0x643);
2845 0 0 else if (chr == 0x6AA) utf8::append(output, 0x643);
2846 0 0 else if (chr == 0x6CC) utf8::append(output, 0x64A);
2848 0 0 else if (chr == ' ' && version == 2) utf8::append(output, 0x01);
0 0 else if (chr == ' ' && version == 2) utf8::append(output, 0x01);
2849 0 0 else if (chr == ' ' && version >= 3) utf8::append(output, 0xA0);
0 0 else if (chr == ' ' && version >= 3) utf8::append(output, 0xA0);
2855 0 0 if (output.empty() && form.len)
0 0 if (output.empty() && form.len)
0 0 if (output.empty() && form.len)
2865 7 0 if (version <= 2) return output.assign(lemma.str, lemma.len);
2869 0 0 for (size_t i = 0; i < lemma.len; i++) {
2871 0 0 if (lemma.str[i] == ' ') utf8::append(output, 0xA0);
2982 0 0 for (string line; getline(is, line); )
0 0 for (string line; getline(is, line); )
2983 0 0 whole.append(line).push_back('\n');
2985 0 0 if (is.eof() && !whole.empty()) is.clear(istream::eofbit);
0 0 if (is.eof() && !whole.empty()) is.clear(istream::eofbit);
0 0 if (is.eof() && !whole.empty()) is.clear(istream::eofbit);
3009 0 0 set_input(input);
3012 0 0 set_output(output);
3022 0 0 if (input.empty()) {
3024 0 0 } else if (input == "tokenize" || input == "tokenizer") {
3026 0 0 } else if (input.compare(0, 10, "tokenizer=") == 0) {
3043 0 0 this->output = output.empty() ? "conllu" : output;
3060 0 0 if (input == "tokenizer") {
3061 0 0 reader.reset(m->new_tokenizer(tokenizer));
3062 0 0 if (!reader) return error.assign("The model does not have a tokenizer!"), false;
0 0 if (!reader) return error.assign("The model does not have a tokenizer!"), false;
3064 0 0 reader.reset(input_format::new_input_format(input));
3065 0 0 if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false;
0 0 if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false;
0 0 if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false;
3067 0 0 reader->reset_document(document_id);
3069 0 0 unique_ptr writer(output_format::new_output_format(output));
3070 0 0 if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false;
0 0 if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false;
0 0 if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false;
3073 0 0 while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) {
0 0 while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) {
0 0 while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) {
0 0 while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) {
3074 0 0 reader->set_text(block);
3075 0 0 while (reader->next_sentence(s, error)) {
0 0 while (reader->next_sentence(s, error)) {
3076 0 0 if (tagger != NONE)
3077 0 0 if (!m->tag(s, tagger, error))
0 0 if (!m->tag(s, tagger, error))
3080 0 0 if (parser != NONE)
3081 0 0 if (!m->parse(s, parser, error))
0 0 if (!m->parse(s, parser, error))
3084 0 0 writer->write_sentence(s, os);
3086 0 0 if (!error.empty()) return false;
3088 0 0 writer->finish_document(os);
3198 0 0 format_tagged_lemma(result);
3203 0 0 for (auto&& lemma : lemmas)
3206 0 0 if (lemmas.size() > 1)
3214 0 0 if (converter) converter->convert(lemma);
3218 0 0 if (converter) converter->convert_analyzed(lemmas);
3231 0 0 for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); )
0 0 for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); )
3233 0 0 if (converter) converter->convert(lemma);
3241 0 0 return derinet ? new root_derivation_formatter(derinet) : nullptr;
0 0 return derinet ? new root_derivation_formatter(derinet) : nullptr;
3250 0 0 if (converter) converter->convert(lemma);
0 0 if (converter) converter->convert(lemma);
3251 0 0 for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) {
0 0 for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) {
3252 0 0 tagged_lemma parrent_lemma(parent.lemma, current.tag);
3253 0 0 if (converter) converter->convert(parrent_lemma);
0 0 if (converter) converter->convert(parrent_lemma);
3254 0 0 lemma.lemma.append(" ").append(parrent_lemma.lemma);
3263 0 0 return derinet ? new path_derivation_formatter(derinet) : nullptr;
0 0 return derinet ? new path_derivation_formatter(derinet) : nullptr;
3272 0 0 if (converter) converter->convert(lemma);
0 0 if (converter) converter->convert(lemma);
3273 0 0 for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {}
0 0 for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {}
3274 0 0 format_tree(root, tag, lemma, converter);
3280 0 0 if (converter) {
3281 0 0 tagged_lemma current(root, tag);
3282 0 0 converter->convert(current);
3283 0 0 tree.lemma.append(" ").append(current.lemma);
3285 0 0 tree.lemma.append(" ").append(root);
3288 0 0 if (derinet->children(root, children))
0 0 if (derinet->children(root, children))
3289 0 0 for (auto&& child : children)
3290 0 0 format_tree(child.lemma, tag, tree, converter);
3291 0 0 tree.lemma.push_back(' ');
3299 0 0 return derinet ? new tree_derivation_formatter(derinet) : nullptr;
0 0 return derinet ? new tree_derivation_formatter(derinet) : nullptr;
3303 0 0 if (name == "none") return new_none_derivation_formatter();
3304 0 0 if (name == "root") return new_root_derivation_formatter(derinet);
3305 0 0 if (name == "path") return new_path_derivation_formatter(derinet);
3306 0 0 if (name == "tree") return new_tree_derivation_formatter(derinet);
3336 0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
529 133 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
133 36 while (len--)
1009 66 while (len--)
0 0 while (len--)
64 10 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
3337 0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
307 222 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
111 22 if (*a++ != *b++)
1000 9 if (*a++ != *b++)
0 0 if (*a++ != *b++)
61 3 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
0 0 if (*a++ != *b++)
3346 0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
729 346 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
20 118 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
120 20 while (len--)
3419 0 0 data.reserve(16);
3423 0 0 if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!");
0 0 if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!");
0 0 if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!");
3428 0 0 if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!");
0 0 if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!");
0 0 if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!");
3447 0 0 if (!(str.len < 255)) add_4B(str.len);
3606 30 10 while (size) {
0 0 while (size) {
0 0 while (size) {
3608 21 9 if (unaligned_load(first + step) < val) {
0 0 if (unaligned_load(first + step) < val) {
0 0 if (unaligned_load(first + step) < val) {
3650 0 0 class persistent_unordered_map {
0 0 class persistent_unordered_map {
3696 0 0 struct persistent_unordered_map::fnv_hash {
3699 52 24 while (mask < num)
3701 24 0 hash.resize(mask + 1);
3705 484 0 uint32_t size = data.next_4B();
3707 484 0 hash.resize(size);
3708 484 0 memcpy(hash.data(), data.next(size), size * sizeof(uint32_t));
3710 484 0 size = data.next_4B();
3711 484 0 this->data.resize(size);
3712 145 339 if (size) memcpy(this->data.data(), data.next(size), size);
145 0 if (size) memcpy(this->data.data(), data.next(size), size);
3716 0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
8 0 if (len <= 0) return 0;
0 8 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
0 0 if (len <= 0) return 0;
330 0 if (len <= 0) return 0;
78 0 if (len <= 0) return 0;
20 0 if (len <= 0) return 0;
20 0 if (len <= 0) return 0;
3717 0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
2 6 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
0 0 if (len == 1) return unaligned_load(data);
13 317 if (len == 1) return unaligned_load(data);
6 72 if (len == 1) return unaligned_load(data);
4 16 if (len == 1) return unaligned_load(data);
4 16 if (len == 1) return unaligned_load(data);
3718 0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
5 1 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
0 0 if (len == 2) return unaligned_load(data);
48 269 if (len == 2) return unaligned_load(data);
67 5 if (len == 2) return unaligned_load(data);
15 1 if (len == 2) return unaligned_load(data);
15 1 if (len == 2) return unaligned_load(data);
3721 0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
38 5 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
0 0 while (len--)
144 48 while (len--)
1003 67 while (len--)
114 15 while (len--)
114 15 while (len--)
3735 0 0 if (unsigned(len) >= hashes.size()) return nullptr;
0 0 if (unsigned(len) >= hashes.size()) return nullptr;
0 0 if (unsigned(len) >= hashes.size()) return nullptr;
0 0 if (unsigned(len) >= hashes.size()) return nullptr;
0 0 if (unsigned(len) >= hashes.size()) return nullptr;
0 0 if (unsigned(len) >= hashes.size()) return nullptr;
0 0 if (unsigned(len) >= hashes.size()) return nullptr;
0 0 if (unsigned(len) >= hashes.size()) return nullptr;
8 0 if (unsigned(len) >= hashes.size()) return nullptr;
0 0 if (unsigned(len) >= hashes.size()) return nullptr;
0 0 if (unsigned(len) >= hashes.size()) return nullptr;
3741 0 0 if (len <= 2)
0 0 if (len <= 2)
0 0 if (len <= 2)
0 0 if (len <= 2)
0 0 if (len <= 2)
0 0 if (len <= 2)
0 0 if (len <= 2)
0 0 if (len <= 2)
0 8 if (len <= 2)
0 0 if (len <= 2)
0 0 if (len <= 2)
3742 0 0 return data != end ? data + len : nullptr;
0 0 return data != end ? data + len : nullptr;
0 0 return data != end ? data + len : nullptr;
0 0 return data != end ? data + len : nullptr;
0 0 return data != end ? data + len : nullptr;
0 0 return data != end ? data + len : nullptr;
0 0 return data != end ? data + len : nullptr;
0 0 return data != end ? data + len : nullptr;
8 0 return data != end ? data + len : nullptr;
0 0 return data != end ? data + len : nullptr;
0 0 return data != end ? data + len : nullptr;
3744 0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
3745 0 0 if (small_memeq(str, data, len)) return data + len;
0 0 if (small_memeq(str, data, len)) return data + len;
0 0 if (small_memeq(str, data, len)) return data + len;
0 0 if (small_memeq(str, data, len)) return data + len;
0 0 if (small_memeq(str, data, len)) return data + len;
0 0 if (small_memeq(str, data, len)) return data + len;
0 0 if (small_memeq(str, data, len)) return data + len;
0 0 if (small_memeq(str, data, len)) return data + len;
0 0 if (small_memeq(str, data, len)) return data + len;
0 0 if (small_memeq(str, data, len)) return data + len;
0 0 if (small_memeq(str, data, len)) return data + len;
3756 330 16 if (unsigned(len) >= hashes.size()) return nullptr;
78 14 if (unsigned(len) >= hashes.size()) return nullptr;
3762 48 282 if (len <= 2)
67 11 if (len <= 2)
3763 234 48 return data != end ? (const T*)(data + len) : nullptr;
10 1 return data != end ? (const T*)(data + len) : nullptr;
3765 58 12 while (data < end) {
75 1 while (data < end) {
3766 36 22 if (small_memeq(str, data, len)) return (const T*)(data + len);
66 9 if (small_memeq(str, data, len)) return (const T*)(data + len);
3775 0 0 if (unsigned(len) >= hashes.size()) return;
8 0 if (unsigned(len) >= hashes.size()) return;
0 0 if (unsigned(len) >= hashes.size()) return;
0 0 if (unsigned(len) >= hashes.size()) return;
0 0 if (unsigned(len) >= hashes.size()) return;
0 0 if (unsigned(len) >= hashes.size()) return;
3781 0 0 while (data < end) {
13 8 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
3791 1 1 for (unsigned len = 0; len < hashes.size(); len++) {
0 0 for (unsigned len = 0; len < hashes.size(); len++) {
0 0 for (unsigned len = 0; len < hashes.size(); len++) {
3795 1 1 while (data < end) {
0 0 while (data < end) {
0 0 while (data < end) {
3809 0 0 return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr;
0 0 return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr;
0 0 return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr;
0 0 return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr;
0 0 return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr;
0 0 return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr;
0 0 return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr;
3813 2 22 if (hashes.size() == 0) hashes.emplace_back(1);
3814 2 20 else if (hashes.size() == 1) hashes.emplace_back(1<<8);
3815 2 18 else if (hashes.size() == 2) hashes.emplace_back(1<<16);
3820 20 0 if (unsigned(str_len) < hashes.size())
3825 24 2 for (auto&& hash : hashes) {
3827 131633 24 for (auto&& len : hash.hash) total += len, len = total - len;
3833 20 0 if (unsigned(str_len) < hashes.size()) {
3844 24 2 for (auto&& hash : hashes)
3845 131633 24 for (int i = hash.hash.size() - 1; i >= 0; i--)
3846 131609 24 hash.hash[i] = i > 0 ? hash.hash[i-1] : 0;
3853 484 103 for (unsigned i = 0; i < sizes; i++)
3931 0 0 if (dictionary) lemma.len = dictionary->lemma_id_len(lemma);
3938 0 0 if (lemma_data) {
3940 0 0 if (parent_encoded) {
3944 0 0 if (parent_data[parent_len])
3954 0 0 if (dictionary) lemma.len = dictionary->lemma_id_len(lemma);
3961 0 0 if (lemma_data) {
3964 0 0 if (children_len) {
3966 0 0 for (unsigned i = 0; i < children_len; i++) {
3970 0 0 if (child_data[child_len])
3982 0 0 if (!compressor::load(is, data)) return false;
0 0 if (!compressor::load(is, data)) return false;
3985 0 0 for (int i = data.next_1B(); i > 0; i--)
0 0 for (int i = data.next_1B(); i > 0; i--)
3986 0 0 derinet.resize(data.next_4B());
0 0 derinet.resize(data.next_4B());
3990 0 0 for (int pass = 1; pass <= 3; pass++) {
3991 0 0 if (pass > 1) data.seek(data_position);
0 0 if (pass > 1) data.seek(data_position);
3994 0 0 for (int i = data.next_4B(); i > 0; i--) {
0 0 for (int i = data.next_4B(); i > 0; i--) {
3995 0 0 lemma.resize(lemma.size() - data.next_1B());
0 0 lemma.resize(lemma.size() - data.next_1B());
3996 0 0 for (int i = data.next_1B(); i > 0; i--)
0 0 for (int i = data.next_1B(); i > 0; i--)
3997 0 0 lemma.push_back(data.next_1B());
3999 0 0 unsigned char lemma_comment_len = data.next_1B();
4000 0 0 const char* lemma_comment = lemma_comment_len ? data.next(lemma_comment_len) : nullptr;
0 0 const char* lemma_comment = lemma_comment_len ? data.next(lemma_comment_len) : nullptr;
4002 0 0 unsigned children = data.next_2B();
4004 0 0 if (pass == 3) parent.clear();
4006 0 0 int operations = data.next_1B();
4007 0 0 if (operations) {
4008 0 0 int remove_start = operations & REMOVE_START ? data.next_1B() : 0;
0 0 int remove_start = operations & REMOVE_START ? data.next_1B() : 0;
4009 0 0 int remove_end = operations & REMOVE_END ? data.next_1B() : 0;
0 0 int remove_end = operations & REMOVE_END ? data.next_1B() : 0;
4010 0 0 if (operations & ADD_START) {
4011 0 0 int add_start = data.next_1B();
4012 0 0 const char* str = data.next(add_start);
4013 0 0 if (pass == 3) parent.assign(str, str + add_start);
4015 0 0 if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end);
0 0 if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end);
4016 0 0 if (operations & ADD_END) {
4017 0 0 int add_end = data.next_1B();
4018 0 0 const char* str = data.next(add_end);
4019 0 0 if (pass == 3) parent.insert(parent.end(), str, str + add_end);
4023 0 0 if (pass == 1) {
4025 0 0 } else if (pass == 2) {
4028 0 0 while (lemma_comment_len--) *lemma_data++ = *lemma_comment++;
4031 0 0 if (children) unaligned_store(((uint32_t*)lemma_data) + children - 1, 0);
4032 0 0 } else if (pass == 3 && !parent.empty()) {
0 0 } else if (pass == 3 && !parent.empty()) {
0 0 } else if (pass == 3 && !parent.empty()) {
4043 0 0 assert(lemma_data && parent_data);
4046 0 0 assert(parent.size() < (1<<8) && parent_offset < (1<<24));
0 0 assert(parent.size() < (1<<8) && parent_offset < (1<<24));
4050 0 0 assert(lemma.size() < (1<<8) && lemma_offset < (1<<24));
0 0 assert(lemma.size() < (1<<8) && lemma_offset < (1<<24));
4055 0 0 if (child_index+1 < children_len)
4060 0 0 if (pass == 1)
4061 0 0 derinet.done_adding();
4062 0 0 if (pass == 2)
4064 0 0 }
4097 22 7 while (form_tmp.len && !rest_has_Lut)
22 0 while (form_tmp.len && !rest_has_Lut)
4106 1 6 if (first_Lut && !rest_has_Lut) { // common case allowing fast execution
4111 0 6 } else if (!first_Lut && rest_has_Lut) {
4114 0 6 } else if (first_Lut && rest_has_Lut) {
4121 0 0 while (form_tmp.len) {
4162 0 0 for (unsigned len = 1; len < lemma.len; len++)
4163 0 0 if (lemma.str[len] == '`' || lemma.str[len] == '_' ||
0 0 if (lemma.str[len] == '`' || lemma.str[len] == '_' ||
4164 0 0 (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9'))
0 0 (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9'))
0 0 (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9'))
4171 0 0 for (unsigned len = 1; len < lemma.len; len++) {
4172 0 0 if (lemma.str[len] == '`' || lemma.str[len] == '_')
4174 0 0 if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') {
0 0 if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') {
0 0 if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') {
0 0 if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') {
4176 0 0 while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++;
0 0 while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++;
0 0 while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++;
4186 0 0 if (addinfo_len) {
4187 0 0 res.reserve(addinfo_len + 4);
4188 0 0 if (addinfo[0] != 255) {
4193 0 0 for (int i = 1; i < addinfo_len; i++)
4201 0 0 for (int i = 1; i + 2 < addinfo_len; i++)
4202 0 0 if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x')
0 0 if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x')
0 0 if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x')
4212 0 0 if (lemma_info < lemma.str + lemma.len) {
4216 0 0 if (*lemma_info == '-') {
4219 0 0 lemma_additional_info < lemma.str + lemma.len && (*lemma_additional_info >= '0' && *lemma_additional_info <= '9');
0 0 lemma_additional_info < lemma.str + lemma.len && (*lemma_additional_info >= '0' && *lemma_additional_info <= '9');
4223 0 0 if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) {
0 0 if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) {
0 0 if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) {
0 0 if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) {
0 0 if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) {
4224 0 0 if (die_on_failure)
4225 0 0 training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!");
0 0 training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!");
0 0 training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!");
4231 0 0 while (lemma_additional_info < lemma.str + lemma.len)
4234 0 0 if (data.size() > 255) {
4235 0 0 if (die_on_failure)
4236 0 0 training_failure("Too long lemma info " << lemma_info << " in lemma " << lemma << '!');
0 0 training_failure("Too long lemma info " << lemma_info << " in lemma " << lemma << '!');
4246 0 0 if (data.empty()) return true;
4247 0 0 if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false;
0 0 if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false;
0 0 if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false;
0 0 if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false;
4291 0 0 if (filters.empty()) return true;
4294 0 0 for (auto&& filter : filters) {
4296 0 0 while (tag_pos < filter.pos)
4297 0 0 if (!tag[tag_pos++])
4299 0 0 if (!tag[tag_pos])
4304 0 0 for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++)
0 0 for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++)
4306 0 0 if (!matched) return false;
4346 12 1 for (int i = data.next_1B(); i > 0; i--)
0 0 for (int i = data.next_1B(); i > 0; i--)
0 0 for (int i = data.next_1B(); i > 0; i--)
4348 12 1 for (int i = data.next_1B(); i > 0; i--)
0 0 for (int i = data.next_1B(); i > 0; i--)
0 0 for (int i = data.next_1B(); i > 0; i--)
4354 1 0 vector root(max(lemmas.max_length(), roots.max_length()));
0 0 vector root(max(lemmas.max_length(), roots.max_length()));
0 0 vector root(max(lemmas.max_length(), roots.max_length()));
4356 2 1 for (int pass = 1; pass <= 2; pass++) {
0 0 for (int pass = 1; pass <= 2; pass++) {
0 0 for (int pass = 1; pass <= 2; pass++) {
4357 1 1 if (pass > 1) data.seek(data_position);
1 0 if (pass > 1) data.seek(data_position);
0 0 if (pass > 1) data.seek(data_position);
0 0 if (pass > 1) data.seek(data_position);
0 0 if (pass > 1) data.seek(data_position);
0 0 if (pass > 1) data.seek(data_position);
4362 2 0 for (int i = data.next_4B(); i > 0; i--) {
20 2 for (int i = data.next_4B(); i > 0; i--) {
0 0 for (int i = data.next_4B(); i > 0; i--) {
0 0 for (int i = data.next_4B(); i > 0; i--) {
0 0 for (int i = data.next_4B(); i > 0; i--) {
0 0 for (int i = data.next_4B(); i > 0; i--) {
4363 20 0 lemma_len -= data.next_1B();
0 0 lemma_len -= data.next_1B();
0 0 lemma_len -= data.next_1B();
4364 20 0 for (int i = data.next_1B(); i > 0; i--)
106 20 for (int i = data.next_1B(); i > 0; i--)
0 0 for (int i = data.next_1B(); i > 0; i--)
0 0 for (int i = data.next_1B(); i > 0; i--)
0 0 for (int i = data.next_1B(); i > 0; i--)
0 0 for (int i = data.next_1B(); i > 0; i--)
4365 106 0 lemma[lemma_len++] = data.next_1B();
0 0 lemma[lemma_len++] = data.next_1B();
0 0 lemma[lemma_len++] = data.next_1B();
4366 20 0 unsigned char lemma_info_len = data.next_1B();
0 0 unsigned char lemma_info_len = data.next_1B();
0 0 unsigned char lemma_info_len = data.next_1B();
4367 0 20 const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr;
0 0 const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr;
0 0 const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr;
0 0 const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr;
0 0 const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr;
0 0 const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr;
4368 20 0 unsigned lemma_roots = data.next_1B();
0 0 unsigned lemma_roots = data.next_1B();
0 0 unsigned lemma_roots = data.next_1B();
4373 10 10 if (pass == 1) {
0 0 if (pass == 1) {
0 0 if (pass == 1) {
4380 0 10 if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len;
0 0 if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len;
0 0 if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len;
4385 20 20 for (unsigned i = 0; i < lemma_roots; i++) {
0 0 for (unsigned i = 0; i < lemma_roots; i++) {
0 0 for (unsigned i = 0; i < lemma_roots; i++) {
4387 20 0 int operations = data.next_1B();
0 0 int operations = data.next_1B();
0 0 int operations = data.next_1B();
4388 4 16 if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
4 0 if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
28 4 if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
0 0 if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
0 0 if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
0 0 if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
0 0 if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
0 0 if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
0 0 if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
4389 12 8 if (operations & REMOVE_END) root_len -= data.next_1B();
12 0 if (operations & REMOVE_END) root_len -= data.next_1B();
0 0 if (operations & REMOVE_END) root_len -= data.next_1B();
0 0 if (operations & REMOVE_END) root_len -= data.next_1B();
0 0 if (operations & REMOVE_END) root_len -= data.next_1B();
0 0 if (operations & REMOVE_END) root_len -= data.next_1B();
4390 6 14 if (operations & ADD_START) {
0 0 if (operations & ADD_START) {
0 0 if (operations & ADD_START) {
4391 6 0 int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to;
38 6 int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to;
0 0 int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to;
0 0 int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to;
0 0 int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to;
0 0 int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to;
4392 8 6 for (int i = 0; i < to; i++) root[i] = data.next_1B();
8 0 for (int i = 0; i < to; i++) root[i] = data.next_1B();
0 0 for (int i = 0; i < to; i++) root[i] = data.next_1B();
0 0 for (int i = 0; i < to; i++) root[i] = data.next_1B();
0 0 for (int i = 0; i < to; i++) root[i] = data.next_1B();
0 0 for (int i = 0; i < to; i++) root[i] = data.next_1B();
4394 12 8 if (operations & ADD_END)
0 0 if (operations & ADD_END)
0 0 if (operations & ADD_END)
4395 12 0 for (int len = data.next_1B(); len > 0; len--)
22 12 for (int len = data.next_1B(); len > 0; len--)
0 0 for (int len = data.next_1B(); len > 0; len--)
0 0 for (int len = data.next_1B(); len > 0; len--)
0 0 for (int len = data.next_1B(); len > 0; len--)
0 0 for (int len = data.next_1B(); len > 0; len--)
4396 22 0 root[root_len++] = data.next_1B();
0 0 root[root_len++] = data.next_1B();
0 0 root[root_len++] = data.next_1B();
4397 20 0 uint16_t clas = data.next_2B();
0 0 uint16_t clas = data.next_2B();
0 0 uint16_t clas = data.next_2B();
4399 10 10 if (pass == 1) { // for each root
0 0 if (pass == 1) { // for each root
0 0 if (pass == 1) { // for each root
4408 0 10 assert(uint8_t(lemma_len) == lemma_len);
0 0 assert(uint8_t(lemma_len) == lemma_len);
0 0 assert(uint8_t(lemma_len) == lemma_len);
4413 0 10 assert(uint8_t(root_len) == root_len);
0 0 assert(uint8_t(root_len) == root_len);
0 0 assert(uint8_t(root_len) == root_len);
4418 1 1 if (pass == 1) { // after the whole pass
0 0 if (pass == 1) { // after the whole pass
0 0 if (pass == 1) { // after the whole pass
4419 1 0 lemmas.done_adding();
0 0 lemmas.done_adding();
0 0 lemmas.done_adding();
4420 1 0 roots.done_adding();
0 0 roots.done_adding();
0 0 roots.done_adding();
4428 1 0 tags.resize(data.next_2B());
1 0 tags.resize(data.next_2B());
0 0 tags.resize(data.next_2B());
0 0 tags.resize(data.next_2B());
0 0 tags.resize(data.next_2B());
0 0 tags.resize(data.next_2B());
4429 6 1 for (auto&& tag : tags) {
0 0 for (auto&& tag : tags) {
0 0 for (auto&& tag : tags) {
4430 6 0 tag.resize(data.next_1B());
0 0 tag.resize(data.next_1B());
0 0 tag.resize(data.next_1B());
4431 397 6 for (unsigned i = 0; i < tag.size(); i++)
0 0 for (unsigned i = 0; i < tag.size(); i++)
0 0 for (unsigned i = 0; i < tag.size(); i++)
4432 397 0 tag[i] = data.next_1B();
0 0 tag[i] = data.next_1B();
0 0 tag[i] = data.next_1B();
4436 1 0 suffixes.load(data);
0 0 suffixes.load(data);
0 0 suffixes.load(data);
4439 1 0 suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable {
0 0 suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable {
0 0 suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable {
4444 6 1 for (unsigned i = 0; i < classes_len; i++)
0 0 for (unsigned i = 0; i < classes_len; i++)
0 0 for (unsigned i = 0; i < classes_len; i++)
4450 6 1 for (unsigned i = 0; i < classes_len; i++) {
0 0 for (unsigned i = 0; i < classes_len; i++) {
0 0 for (unsigned i = 0; i < classes_len; i++) {
4452 6 0 if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1);
6 0 if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1);
0 0 if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1);
0 0 if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1);
0 0 if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1);
0 0 if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1);
4455 6 0 classes[classes_ptr_i].emplace_back(suffix_str, vector());
0 0 classes[classes_ptr_i].emplace_back(suffix_str, vector());
0 0 classes[classes_ptr_i].emplace_back(suffix_str, vector());
4456 6 6 for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++)
0 0 for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++)
0 0 for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++)
4457 6 0 classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr));
0 0 classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr));
0 0 classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr));
4467 0 8 uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data());
0 0 uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data());
0 0 uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data());
0 0 uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data());
0 0 uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data());
0 0 uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data());
4469 8 8 for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) {
0 0 for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) {
0 0 for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) {
4477 8 8 for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
0 8 for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
8 8 for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
0 0 for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
0 0 for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
0 0 for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
0 0 for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
0 0 for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
0 0 for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
4478 8 0 if (unaligned_load(suff[suff_len])) {
0 0 if (unaligned_load(suff[suff_len])) {
0 0 if (unaligned_load(suff[suff_len])) {
4482 8 0 roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) {
0 0 roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) {
0 0 roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) {
4487 10 3 if (small_memeq(form.str, root, root_len)) {
0 0 if (small_memeq(form.str, root, root_len)) {
0 0 if (small_memeq(form.str, root, root_len)) {
4489 10 0 if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) {
0 10 if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) {
10 0 if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) {
0 0 if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) {
0 0 if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) {
0 0 if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) {
0 0 if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) {
0 0 if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) {
0 0 if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) {
4492 0 10 if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]);
0 0 if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]);
0 0 if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]);
0 0 if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]);
4496 10 10 for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data));
0 0 for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data));
0 0 for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data));
4498 10 0 lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]);
0 0 lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]);
0 0 lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]);
4508 0 0 int raw_lemma_len = addinfo.parse(lemma);
0 0 int raw_lemma_len = addinfo.parse(lemma);
4511 0 0 lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) {
0 0 lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) {
0 0 lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) {
4517 0 0 if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
0 0 if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
0 0 if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
0 0 if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
0 0 if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
0 0 if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
0 0 if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
0 0 if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
0 0 if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
4522 0 0 for (unsigned i = 0; i < lemma_roots_len; i++) {
0 0 for (unsigned i = 0; i < lemma_roots_len; i++) {
0 0 for (unsigned i = 0; i < lemma_roots_len; i++) {
4528 0 0 for (auto&& suffix : classes[clas]) {
0 0 for (auto&& suffix : classes[clas]) {
0 0 for (auto&& suffix : classes[clas]) {
4530 0 0 for (auto&& tag : suffix.second)
0 0 for (auto&& tag : suffix.second)
0 0 for (auto&& tag : suffix.second)
4531 0 0 if (filter.matches(tags[tag].c_str())) {
0 0 if (filter.matches(tags[tag].c_str())) {
0 0 if (filter.matches(tags[tag].c_str())) {
4532 0 0 if (!forms) {
0 0 if (!forms) {
0 0 if (!forms) {
4533 0 0 lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len));
0 0 lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len));
0 0 lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len));
0 0 lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len));
0 0 lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len));
0 0 lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len));
0 0 lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len));
4537 0 0 if (root_with_suffix.empty() && root_len + suffix.first.size()) {
0 0 if (root_with_suffix.empty() && root_len + suffix.first.size()) {
0 0 if (root_with_suffix.empty() && root_len + suffix.first.size()) {
0 0 if (root_with_suffix.empty() && root_len + suffix.first.size()) {
0 0 if (root_with_suffix.empty() && root_len + suffix.first.size()) {
0 0 if (root_with_suffix.empty() && root_len + suffix.first.size()) {
0 0 if (root_with_suffix.empty() && root_len + suffix.first.size()) {
0 0 if (root_with_suffix.empty() && root_len + suffix.first.size()) {
0 0 if (root_with_suffix.empty() && root_len + suffix.first.size()) {
4538 0 0 root_with_suffix.reserve(root_len + suffix.first.size());
0 0 root_with_suffix.reserve(root_len + suffix.first.size());
0 0 root_with_suffix.reserve(root_len + suffix.first.size());
4543 0 0 forms->emplace_back(root_with_suffix, tags[tag]);
0 0 forms->emplace_back(root_with_suffix, tags[tag]);
0 0 forms->emplace_back(root_with_suffix, tags[tag]);
4590 0 0 for (unsigned tag_filters_len = data.next_1B(); tag_filters_len; tag_filters_len--) {
4594 0 0 tag_filters.emplace_back(tag_filter.c_str());
4605 0 0 if (!form.len) return;
4609 0 0 middle_masks.reserve(form.len);
4611 0 0 for (unsigned initial = 0; initial < form.len; initial++) {
4614 0 0 if (initial) {
4616 0 0 if (!found) break;
4621 0 0 if (initial_mask) {
4622 0 0 middle_masks.resize(initial);
4623 0 0 middle_masks.emplace_back(initial_mask);
4624 0 0 for (unsigned middle = initial; middle < middle_masks.size(); middle++) {
4625 0 0 if (!middle_masks[middle]) continue;
4627 0 0 for (unsigned i = middle + 1; i < form.len; i++) {
4629 0 0 if (!found) break;
4630 0 0 if (unaligned_load(found)) {
4631 0 0 if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1);
0 0 if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1);
4637 0 0 if (middle > initial && middle < form.len ) {
0 0 if (middle > initial && middle < form.len ) {
4638 0 0 if (initial) {
4639 0 0 if (form_tmp.empty()) form_tmp.assign(form.str, form.str + form.len);
4643 0 0 dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas);
0 0 dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas);
4645 0 0 for (unsigned i = lemmas_ori_size; i < lemmas.size(); i++) {
4646 0 0 for (unsigned filter = 0; filter < tag_filters.size(); filter++)
4647 0 0 if ((middle_masks[middle] & (1<
0 0 if ((middle_masks[middle] & (1<
0 0 if ((middle_masks[middle] & (1<
4648 0 0 if (i == lemmas_new_size) {
4651 0 0 lemmas[lemmas_new_size].lemma.reserve(lemmas[i].lemma.size() + middle - initial);
4660 0 0 if (lemmas_new_size < lemmas.size()) lemmas.erase(lemmas.begin() + lemmas_new_size, lemmas.end());
4785 30 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
4 26 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
0 0 return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
4845 0 0 czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {}
0 0 czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {}
0 0 czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {}
4888 0 0 if (!compressor::load(is, data)) return false;
0 0 if (!compressor::load(is, data)) return false;
4892 0 0 unsigned tag_length = data.next_1B();
4893 0 0 if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length);
0 0 if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length);
4894 0 0 if (tag_length < number_tag.size()) number_tag.erase(tag_length);
0 0 if (tag_length < number_tag.size()) number_tag.erase(tag_length);
4895 0 0 if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length);
0 0 if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length);
4898 0 0 dictionary.load(data);
4902 0 0 if (data.next_1B()) {
0 0 if (data.next_1B()) {
4903 0 0 prefix_guesser.reset(new morpho_prefix_guesser(dictionary));
4904 0 0 prefix_guesser->load(data);
4909 0 0 if (data.next_1B()) {
0 0 if (data.next_1B()) {
4910 0 0 statistical_guesser.reset(new morpho_statistical_guesser());
4911 0 0 statistical_guesser->load(data);
4912 0 0 }
4923 0 0 if (form.len) {
4927 0 0 generate_casing_variants(form, form_uclc, form_lc);
4930 0 0 dictionary.analyze(form, lemmas);
4931 0 0 if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
0 0 if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
4932 0 0 if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
0 0 if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
4933 0 0 if (!lemmas.empty()) return NO_GUESSER;
4936 0 0 analyze_special(form, lemmas);
4937 0 0 if (!lemmas.empty()) return NO_GUESSER;
4940 0 0 if (guesser == GUESSER && prefix_guesser)
0 0 if (guesser == GUESSER && prefix_guesser)
0 0 if (guesser == GUESSER && prefix_guesser)
4941 0 0 prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas);
0 0 prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas);
4945 0 0 if (guesser == GUESSER && statistical_guesser) {
0 0 if (guesser == GUESSER && statistical_guesser) {
0 0 if (guesser == GUESSER && statistical_guesser) {
4946 0 0 if (form_uclc.empty() && form_lc.empty())
0 0 if (form_uclc.empty() && form_lc.empty())
0 0 if (form_uclc.empty() && form_lc.empty())
4947 0 0 statistical_guesser->analyze(form, lemmas, nullptr);
4949 0 0 morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3);
4950 0 0 statistical_guesser->analyze(form, lemmas, &used_rules);
4951 0 0 if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules);
0 0 if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules);
4952 0 0 if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules);
0 0 if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules);
4958 0 0 if (prefix_guesser_guesses) {
4961 0 0 return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag);
4964 0 0 return a.lemma == b.lemma && a.tag == b.tag;
0 0 return a.lemma == b.lemma && a.tag == b.tag;
4966 0 0 if (lemmas_end != lemmas.end()) lemmas.erase(lemmas_end, lemmas.end());
4969 0 0 if (!lemmas.empty()) return GUESSER;
4972 0 0 lemmas.emplace_back(string(form.str, form.len), unknown_tag);
4981 0 0 if (lemma.len) {
4982 0 0 if (dictionary.generate(lemma, filter, forms))
0 0 if (dictionary.generate(lemma, filter, forms))
4985 0 0 if (guesser == GUESSER && prefix_guesser)
0 0 if (guesser == GUESSER && prefix_guesser)
5006 0 0 return new czech_tokenizer(language, version, this);
5037 0 0 if (!form.len) return;
5045 0 0 if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len);
5046 0 0 while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len);
5047 0 0 if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len);
0 0 if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len);
0 0 if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len);
5048 0 0 while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len);
5049 0 0 if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
0 0 if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
5051 0 0 if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len);
5053 0 0 while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len);
5056 0 0 if (any_digit && !form.len && (!codepoint || codepoint == '.')) {
0 0 if (any_digit && !form.len && (!codepoint || codepoint == '.')) {
0 0 if (any_digit && !form.len && (!codepoint || codepoint == '.')) {
5057 0 0 lemmas.emplace_back(string(form_ori.str, form_ori.len), number_tag);
5058 0 0 } else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) ||
0 0 } else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) ||
0 0 } else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) ||
0 0 } else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) ||
5059 0 0 ((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first])))
0 0 ((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first])))
5060 0 0 lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag);
5098 0 0 for (unsigned len = 1; len < lemma.len; len++) {
5099 0 0 if (len + 1 == lemma.len && (lemma.str[len] == '^' || lemma.str[len] == '+'))
0 0 if (len + 1 == lemma.len && (lemma.str[len] == '^' || lemma.str[len] == '+'))
5101 0 0 if (len + 1 < lemma.len && lemma.str[len] == '^') {
0 0 if (len + 1 < lemma.len && lemma.str[len] == '^') {
5103 0 0 for (unsigned i = len + 1; ok && i < lemma.len; i++)
0 0 for (unsigned i = len + 1; ok && i < lemma.len; i++)
5104 0 0 ok &= (lemma.str[i] >= 'A' && lemma.str[i] <= 'Z') ||
5105 0 0 (lemma.str[i] >= 'a' && lemma.str[i] <= 'z') ||
0 0 (lemma.str[i] >= 'a' && lemma.str[i] <= 'z') ||
5106 0 0 (i > len + 1 && lemma.str[i] == '-');
5107 0 0 if (ok) return len;
5130 0 0 for (size_t i = len; i < lemma.len; i++)
5137 0 0 if (data.empty()) return true;
5138 0 0 if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^';
0 0 if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^';
0 0 if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^';
0 0 if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^';
0 0 if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^';
5139 0 0 if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0;
0 0 if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0;
0 0 if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0;
5140 0 0 return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len);
0 0 return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len);
5160 0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
0 0 class english_morpho_guesser {
5208 0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
0 0 english_morpho(unsigned version) : version(version) {}
5282 0 0 if (!compressor::load(is, data)) return false;
0 0 if (!compressor::load(is, data)) return false;
5285 0 0 dictionary.load(data);
5286 0 0 morpho_guesser.load(data);
0 0 morpho_guesser.load(data);
5297 0 0 if (form.len) {
5301 0 0 generate_casing_variants(form, form_uclc, form_lc);
5304 0 0 dictionary.analyze(form, lemmas);
5305 0 0 if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
0 0 if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
5306 0 0 if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
0 0 if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
5307 0 0 if (!lemmas.empty())
5308 0 0 return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER;
0 0 return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER;
0 0 return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER;
0 0 return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER;
5311 0 0 analyze_special(form, lemmas);
5312 0 0 if (!lemmas.empty()) return NO_GUESSER;
5315 0 0 if (guesser == GUESSER)
5316 0 0 morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas);
0 0 morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas);
5317 0 0 if (!lemmas.empty()) return GUESSER;
5320 0 0 lemmas.emplace_back(string(form.str, form.len), unknown_tag);
5329 0 0 if (lemma.len) {
5330 0 0 if (dictionary.generate(lemma, filter, forms))
0 0 if (dictionary.generate(lemma, filter, forms))
5350 0 0 return new english_tokenizer(version <= 2 ? 1 : 2);
5357 0 0 if (!form.len) return;
5360 0 0 if (form.len == 1)
5364 0 0 case '?': lemmas.emplace_back(string(form.str, form.len), dot_tag); return;
5365 0 0 case ',': lemmas.emplace_back(string(form.str, form.len), comma_tag); return;
5366 0 0 case '#': lemmas.emplace_back(string(form.str, form.len), hash_tag); return;
5367 0 0 case '$': lemmas.emplace_back(string(form.str, form.len), dollar_tag); return;
5368 0 0 case '[': lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
5369 0 0 case ']': lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
5370 0 0 case '%': lemmas.emplace_back(string(form.str, form.len), jj_tag);
5371 0 0 lemmas.emplace_back(string(form.str, form.len), nn_tag); return;
5372 0 0 case '&': lemmas.emplace_back(string(form.str, form.len), cc_tag);
5373 0 0 lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
5374 0 0 case '*': lemmas.emplace_back(string(form.str, form.len), sym_tag);
5375 0 0 lemmas.emplace_back(string(form.str, form.len), nn_tag); return;
5376 0 0 case '@': lemmas.emplace_back(string(form.str, form.len), sym_tag);
5377 0 0 lemmas.emplace_back(string(form.str, form.len), in_tag); return;
5378 0 0 case '\'': lemmas.emplace_back(string(form.str, form.len), close_quotation_tag);
5379 0 0 lemmas.emplace_back(string(form.str, form.len), pos_tag); return;
5386 0 0 if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
5387 0 0 while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
5388 0 0 while (codepoint == ',') {
5390 0 0 if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break;
5391 0 0 if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break;
5392 0 0 if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break;
5397 0 0 if (codepoint == '.' && number.len) {
0 0 if (codepoint == '.' && number.len) {
5399 0 0 while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
5401 0 0 if (version >= 2 && any_digit && codepoint == 's' && !number.len) {
0 0 if (version >= 2 && any_digit && codepoint == 's' && !number.len) {
0 0 if (version >= 2 && any_digit && codepoint == 's' && !number.len) {
5402 0 0 lemmas.emplace_back(string(form.str, form.len), number_tag);
5403 0 0 lemmas.emplace_back(string(form.str, form.len - 1), nns_tag);
5406 0 0 if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
0 0 if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
5408 0 0 if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
5410 0 0 while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
5412 0 0 if (any_digit && !number.len && (!codepoint || codepoint == '.')) {
0 0 if (any_digit && !number.len && (!codepoint || codepoint == '.')) {
0 0 if (any_digit && !number.len && (!codepoint || codepoint == '.')) {
5413 0 0 lemmas.emplace_back(string(form.str, form.len), number_tag);
5414 0 0 lemmas.emplace_back(string(form.str, form.len), nnp_tag);
5415 0 0 if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9')
0 0 if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9')
0 0 if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9')
0 0 if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9')
5416 0 0 lemmas.emplace_back(string(form.str, form.len), ls_tag);
5423 0 0 while ((symbol || any_punctuation) && punctuation.len) {
0 0 while ((symbol || any_punctuation) && punctuation.len) {
5425 0 0 if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi;
0 0 if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi;
0 0 if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi;
5426 0 0 if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf;
0 0 if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf;
0 0 if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf;
5427 0 0 if (open_parenthesis) open_parenthesis = unicode::category(codepoint) & unicode::Ps;
5428 0 0 if (close_parenthesis) close_parenthesis = unicode::category(codepoint) & unicode::Pe;
5429 0 0 if (any_punctuation) any_punctuation = unicode::category(codepoint) & unicode::P;
5430 0 0 if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S;
0 0 if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S;
0 0 if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S;
5432 0 0 if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; }
0 0 if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; }
0 0 if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; }
5433 0 0 if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; }
0 0 if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; }
0 0 if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; }
5434 0 0 if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; }
0 0 if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; }
0 0 if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; }
5435 0 0 if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; }
0 0 if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; }
0 0 if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; }
5436 0 0 if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; }
0 0 if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; }
0 0 if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; }
5437 0 0 if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; }
0 0 if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; }
0 0 if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; }
5471 0 0 while (tags--) {
5473 0 0 exceptions_tags.emplace_back(string(data.next(len), len));
5609 0 0 for (unsigned len = data.next_1B(); len; len--) {
5615 0 0 if (exception) {
5618 0 0 for (unsigned len = data.next_1B(); len; len--) {
5621 0 0 for (unsigned tags = data.next_1B(); tags; tags--)
5622 0 0 lemmas.emplace_back(lemma, exceptions_tags[data.next_2B()]);
5629 0 0 for (unsigned prefix = 1; prefix <= form_lc.len; prefix++) {
5631 0 0 if (!found) break;
5632 0 0 if (found[NEGATION_LEN]) {
5633 0 0 if (form_lc.len - prefix >= found[TO_FOLLOW]) negation_len = found[NEGATION_LEN];
5639 0 0 add(JJ, lemma_lc, negation_len, lemmas);
5640 0 0 add(RB, lemma_lc, negation_len, lemmas);
5641 0 0 add(NN, lemma_lc, negation_len, lemmas);
5642 0 0 add_NNS(lemma_lc, negation_len, lemmas);
5659 0 0 if ( p == ( (form_lc.str + form_lc.len)) )
5666 0 0 if ( _klen > 0 ) {
5671 0 0 if ( _upper < _lower )
5675 0 0 if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < *_mid )
5677 0 0 else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > *_mid )
5689 0 0 if ( _klen > 0 ) {
5694 0 0 if ( _upper < _lower )
5698 0 0 if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < _mid[0] )
5700 0 0 else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > _mid[1] )
5714 0 0 if ( _tag_guesser_trans_actions[_trans] == 0 )
5719 0 0 while ( _nacts-- > 0 )
5724 0 0 { if (!added_JJR_RBR) added_JJR_RBR = true, add_JJR_RBR(lemma_lc, negation_len, lemmas); }
0 0 { if (!added_JJR_RBR) added_JJR_RBR = true, add_JJR_RBR(lemma_lc, negation_len, lemmas); }
5727 0 0 { if (!added_JJS_RBS) added_JJS_RBS = true, add_JJS_RBS(lemma_lc, negation_len, lemmas); }
0 0 { if (!added_JJS_RBS) added_JJS_RBS = true, add_JJS_RBS(lemma_lc, negation_len, lemmas); }
5730 0 0 { add_VBG(lemma_lc, lemmas); }
5733 0 0 { add_VBD_VBN(lemma_lc, lemmas); }
5736 0 0 { add_VBZ(lemma_lc, lemmas); }
5742 0 0 { if (!added_SYM) added_SYM = true, add(SYM, lemma_lc, lemmas); }
5745 0 0 { if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); }
5751 0 0 if ( ++p != ( (form_lc.str + form_lc.len)) )
5754 0 0 if ( p == ( (form_lc.str + form_lc.len)) )
5758 0 0 while ( __nacts-- > 0 ) {
5759 0 0 switch ( *__acts++ ) {
5761 0 0 { if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); }
5777 0 0 bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9')));
0 0 bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9')));
0 0 bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9')));
0 0 bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9')));
5779 0 0 if (!is_NNP && !is_NNPS) return false;
5782 0 0 for (auto&& lemma : lemmas) {
5786 0 0 if (!((is_NNP && !was_NNP) || (is_NNPS && !was_NNPS))) return false;
0 0 if (!((is_NNP && !was_NNP) || (is_NNPS && !was_NNPS))) return false;
5789 0 0 if (is_NNP && !was_NNP) add(NNP, lemma, lemmas);
5790 0 0 if (is_NNPS && !was_NNPS) add_NNPS(lemma, lemmas);
0 0 if (is_NNPS && !was_NNPS) add_NNPS(lemma, lemmas);
5795 0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
0 0 lemmas.emplace_back(form, tag);
5804 0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
0 0 lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
5906 0 0 if ( p == ( (form.c_str() + form.size())) )
5915 0 0 if ( _klen > 0 ) {
5920 0 0 if ( _upper < _lower )
5924 0 0 if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid )
5926 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid )
5938 0 0 if ( _klen > 0 ) {
5943 0 0 if ( _upper < _lower )
5947 0 0 if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] )
5949 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] )
5963 0 0 if ( _NNS_trans_actions[_trans] == 0 )
5968 0 0 while ( _nacts-- > 0 )
5973 0 0 { if (best > 'a') best = 'a', remove = 2, append = "an"; }
5976 0 0 { if (best > 'b') best = 'b', remove = 1, append = nullptr; }
5979 0 0 { if (best > 'c') best = 'c', remove = 3, append = "fe"; }
5982 0 0 { if (best > 'd') best = 'd', remove = 2, append = nullptr; }
5985 0 0 { if (best > 'e') best = 'e', remove = 1, append = nullptr; }
5988 0 0 { if (best > 'f') best = 'f', remove = 2, append = nullptr; }
5991 0 0 { if (best > 'g') best = 'g', remove = 1, append = nullptr; }
5994 0 0 { if (best > 'h') best = 'h', remove = 2, append = nullptr; }
5997 0 0 { if (best > 'i') best = 'i', remove = 1, append = nullptr; }
6000 0 0 { if (best > 'j') best = 'j', remove = 1, append = nullptr; }
6003 0 0 { if (best > 'k') best = 'k', remove = 2, append = nullptr; }
6006 0 0 { if (best > 'l') best = 'l', remove = 3, append = "y"; }
6009 0 0 { if (best > 'm') best = 'm', remove = 2, append = nullptr; }
6012 0 0 { if (best > 'n') best = 'n', remove = 1, append = nullptr; }
6018 0 0 if ( cs == 0 )
6020 0 0 if ( ++p != ( (form.c_str() + form.size())) )
6026 0 0 add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
0 0 add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
0 0 add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
6152 0 0 if ( p == ( (form.c_str() + form.size())) )
6161 0 0 if ( _klen > 0 ) {
6166 0 0 if ( _upper < _lower )
6170 0 0 if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
6172 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
6184 0 0 if ( _klen > 0 ) {
6189 0 0 if ( _upper < _lower )
6193 0 0 if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
6195 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
6209 0 0 if ( _NNPS_trans_actions[_trans] == 0 )
6214 0 0 while ( _nacts-- > 0 )
6219 0 0 { if (best > 'a') best = 'a', remove = 2, append = "AN"; }
6222 0 0 { if (best > 'b') best = 'b', remove = 2, append = "an"; }
6225 0 0 { if (best > 'c') best = 'c', remove = 1, append = nullptr; }
6228 0 0 { if (best > 'd') best = 'd', remove = 3, append = "FE"; }
6231 0 0 { if (best > 'e') best = 'e', remove = 3, append = "fe"; }
6234 0 0 { if (best > 'f') best = 'f', remove = 2, append = nullptr; }
6237 0 0 { if (best > 'g') best = 'g', remove = 1, append = nullptr; }
6240 0 0 { if (best > 'h') best = 'h', remove = 2, append = nullptr; }
6243 0 0 { if (best > 'i') best = 'i', remove = 1, append = nullptr; }
6246 0 0 { if (best > 'j') best = 'j', remove = 2, append = nullptr; }
6249 0 0 { if (best > 'k') best = 'k', remove = 1, append = nullptr; }
6252 0 0 { if (best > 'l') best = 'l', remove = 1, append = nullptr; }
6255 0 0 { if (best > 'm') best = 'm', remove = 2, append = nullptr; }
6258 0 0 { if (best > 'n') best = 'n', remove = 3, append = "Y"; }
6261 0 0 { if (best > 'o') best = 'o', remove = 3, append = "y"; }
6264 0 0 { if (best > 'p') best = 'p', remove = 2, append = nullptr; }
6267 0 0 { if (best > 'q') best = 'q', remove = 1, append = nullptr; }
6273 0 0 if ( cs == 0 )
6275 0 0 if ( ++p != ( (form.c_str() + form.size())) )
6281 0 0 add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
0 0 add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
6581 0 0 if ( p == ( (form.c_str() + form.size())) )
6590 0 0 if ( _klen > 0 ) {
6595 0 0 if ( _upper < _lower )
6599 0 0 if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
6601 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
6613 0 0 if ( _klen > 0 ) {
6618 0 0 if ( _upper < _lower )
6622 0 0 if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
6624 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
6638 0 0 if ( _VBG_trans_actions[_trans] == 0 )
6643 0 0 while ( _nacts-- > 0 )
6648 0 0 { if (best > 'a') best = 'a', remove = 3, append = nullptr; }
6651 0 0 { if (best > 'b') best = 'b', remove = 3, append = "e"; }
6654 0 0 { if (best > 'c') best = 'c', remove = 3, append = nullptr; }
6657 0 0 { if (best > 'd') best = 'd', remove = 3, append = "e"; }
6660 0 0 { if (best > 'e') best = 'e', remove = 3, append = nullptr; }
6663 0 0 { if (best > 'f') best = 'f', remove = 3, append = "e"; }
6666 0 0 { if (best > 'g') best = 'g', remove = 3, append = nullptr; }
6669 0 0 { if (best > 'h') best = 'h', remove = 3, append = "e"; }
6672 0 0 { if (best > 'i') best = 'i', remove = 3, append = nullptr; }
6675 0 0 { if (best > 'j') best = 'j', remove = 3, append = "e"; }
6678 0 0 { if (best > 'k') best = 'k', remove = 3, append = nullptr; }
6681 0 0 { if (best > 'l') best = 'l', remove = 3, append = "e"; }
6684 0 0 { if (best > 'm') best = 'm', remove = 3, append = nullptr; }
6687 0 0 { if (best > 'n') best = 'n', remove = 3, append = "e"; }
6690 0 0 { if (best > 'o') best = 'o', remove = 3, append = nullptr; }
6693 0 0 { if (best > 'p') best = 'p', remove = 3, append = "e"; }
6696 0 0 { if (best > 'q') best = 'q', remove = 3, append = nullptr; }
6699 0 0 { if (best > 'r') best = 'r', remove = 3, append = "e"; }
6705 0 0 if ( cs == 0 )
6707 0 0 if ( ++p != ( (form.c_str() + form.size())) )
6710 0 0 if ( p == ( (form.c_str() + form.size())) )
6714 0 0 while ( __nacts-- > 0 ) {
6717 0 0 { if (best > 'c') best = 'c', remove = 3, append = nullptr; }
6720 0 0 { if (best > 'f') best = 'f', remove = 3, append = "e"; }
6723 0 0 { if (best > 'p') best = 'p', remove = 3, append = "e"; }
6732 0 0 add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
0 0 add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
7035 0 0 if ( p == ( (form.c_str() + form.size())) )
7044 0 0 if ( _klen > 0 ) {
7049 0 0 if ( _upper < _lower )
7053 0 0 if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
7055 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
7067 0 0 if ( _klen > 0 ) {
7072 0 0 if ( _upper < _lower )
7076 0 0 if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
7078 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
7092 0 0 if ( _VBD_VBN_trans_actions[_trans] == 0 )
7097 0 0 while ( _nacts-- > 0 )
7102 0 0 { if (best > 'a') best = 'a', remove = 1, append = nullptr; }
7105 0 0 { if (best > 'b') best = 'b', remove = 2, append = nullptr; }
7108 0 0 { if (best > 'c') best = 'c', remove = 1, append = nullptr; }
7111 0 0 { if (best > 'd') best = 'd', remove = 2, append = nullptr; }
7114 0 0 { if (best > 'e') best = 'e', remove = 1, append = nullptr; }
7117 0 0 { if (best > 'f') best = 'f', remove = 2, append = nullptr; }
7120 0 0 { if (best > 'h') best = 'h', remove = 2, append = nullptr; }
7123 0 0 { if (best > 'i') best = 'i', remove = 3, append = "y"; }
7126 0 0 { if (best > 'j') best = 'j', remove = 1, append = nullptr; }
7129 0 0 { if (best > 'k') best = 'k', remove = 2, append = nullptr; }
7132 0 0 { if (best > 'l') best = 'l', remove = 1, append = nullptr; }
7135 0 0 { if (best > 'm') best = 'm', remove = 2, append = nullptr; }
7138 0 0 { if (best > 'n') best = 'n', remove = 1, append = nullptr; }
7141 0 0 { if (best > 'o') best = 'o', remove = 2, append = nullptr; }
7144 0 0 { if (best > 'p') best = 'p', remove = 1, append = nullptr; }
7147 0 0 { if (best > 'q') best = 'q', remove = 2, append = nullptr; }
7150 0 0 { if (best > 'r') best = 'r', remove = 1, append = nullptr; }
7156 0 0 if ( cs == 0 )
7158 0 0 if ( ++p != ( (form.c_str() + form.size())) )
7161 0 0 if ( p == ( (form.c_str() + form.size())) )
7165 0 0 while ( __nacts-- > 0 ) {
7168 0 0 { if (best > 'd') best = 'd', remove = 2, append = nullptr; }
7171 0 0 { if (best > 'g') best = 'g', remove = 1, append = nullptr; }
7174 0 0 { if (best > 'j') best = 'j', remove = 1, append = nullptr; }
7183 0 0 add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
0 0 add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
7262 0 0 if ( p == ( (form.c_str() + form.size())) )
7271 0 0 if ( _klen > 0 ) {
7276 0 0 if ( _upper < _lower )
7280 0 0 if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
7282 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
7294 0 0 if ( _klen > 0 ) {
7299 0 0 if ( _upper < _lower )
7303 0 0 if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
7305 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
7319 0 0 if ( _VBZ_trans_actions[_trans] == 0 )
7324 0 0 while ( _nacts-- > 0 )
7329 0 0 { if (best > 'a') best = 'a', remove = 1, append = nullptr; }
7332 0 0 { if (best > 'b') best = 'b', remove = 2, append = nullptr; }
7335 0 0 { if (best > 'c') best = 'c', remove = 1, append = nullptr; }
7338 0 0 { if (best > 'd') best = 'd', remove = 2, append = nullptr; }
7341 0 0 { if (best > 'e') best = 'e', remove = 1, append = nullptr; }
7344 0 0 { if (best > 'f') best = 'f', remove = 2, append = nullptr; }
7347 0 0 { if (best > 'g') best = 'g', remove = 3, append = "y"; }
7350 0 0 { if (best > 'h') best = 'h', remove = 2, append = nullptr; }
7353 0 0 { if (best > 'i') best = 'i', remove = 1, append = nullptr; }
7359 0 0 if ( cs == 0 )
7361 0 0 if ( ++p != ( (form.c_str() + form.size())) )
7367 0 0 add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
0 0 add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
7493 0 0 if ( p == ( (form.c_str() + form.size())) )
7502 0 0 if ( _klen > 0 ) {
7507 0 0 if ( _upper < _lower )
7511 0 0 if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid )
7513 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid )
7525 0 0 if ( _klen > 0 ) {
7530 0 0 if ( _upper < _lower )
7534 0 0 if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] )
7536 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] )
7550 0 0 if ( _JJR_RBR_trans_actions[_trans] == 0 )
7555 0 0 while ( _nacts-- > 0 )
7560 0 0 { if (best > 'a') best = 'a', remove = 2, append = nullptr; }
7563 0 0 { if (best > 'b') best = 'b', remove = 3, append = nullptr; }
7566 0 0 { if (best > 'c') best = 'c', remove = 3, append = "y"; }
7569 0 0 { if (best > 'd') best = 'd', remove = 2, append = nullptr; }
7572 0 0 { if (best > 'e') best = 'e', remove = 1, append = nullptr; }
7575 0 0 { if (best > 'f') best = 'f', remove = 2, append = nullptr; }
7581 0 0 if ( cs == 0 )
7583 0 0 if ( ++p != ( (form.c_str() + form.size())) )
7589 0 0 add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
0 0 add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
0 0 add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
7719 0 0 if ( p == ( (form.c_str() + form.size())) )
7728 0 0 if ( _klen > 0 ) {
7733 0 0 if ( _upper < _lower )
7737 0 0 if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid )
7739 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid )
7751 0 0 if ( _klen > 0 ) {
7756 0 0 if ( _upper < _lower )
7760 0 0 if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] )
7762 0 0 else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] )
7776 0 0 if ( _JJS_RBS_trans_actions[_trans] == 0 )
7781 0 0 while ( _nacts-- > 0 )
7786 0 0 { if (best > 'a') best = 'a', remove = 3, append = nullptr; }
7789 0 0 { if (best > 'b') best = 'b', remove = 4, append = nullptr; }
7792 0 0 { if (best > 'c') best = 'c', remove = 4, append = "y"; }
7795 0 0 { if (best > 'd') best = 'd', remove = 3, append = nullptr; }
7798 0 0 { if (best > 'e') best = 'e', remove = 2, append = nullptr; }
7801 0 0 { if (best > 'f') best = 'f', remove = 3, append = nullptr; }
7807 0 0 if ( cs == 0 )
7809 0 0 if ( ++p != ( (form.c_str() + form.size())) )
7815 0 0 add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
0 0 add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
0 0 add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
7898 0 0 if (!compressor::load(is, data)) return false;
0 0 if (!compressor::load(is, data)) return false;
7902 0 0 unsigned length = data.next_1B();
7903 0 0 unknown_tag.assign(data.next(length), length);
0 0 unknown_tag.assign(data.next(length), length);
7914 0 0 if (form.len) {
7917 0 0 while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
0 0 while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
7918 0 0 if (lemmatags.len) lemmatags.len--, lemmatags.str++;
7921 0 0 while (lemmatags.len) {
7923 0 0 while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
0 0 while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
7924 0 0 if (!lemmatags.len) break;
7929 0 0 while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
0 0 while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
7931 0 0 if (lemmatags.len) lemmatags.len--, lemmatags.str++;
7933 0 0 lemmas.emplace_back(string(lemma_start, lemma_len), string(tag_start, tag_len));
7936 0 0 if (!lemmas.empty()) return NO_GUESSER;
7939 0 0 lemmas.emplace_back(string(form.str, form.len), unknown_tag);
7948 0 0 if (lemma.len) {
7951 0 0 while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
0 0 while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
7953 0 0 if (formtags.len) formtags.len--, formtags.str++;
7957 0 0 while (formtags.len) {
7959 0 0 while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
0 0 while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
7960 0 0 if (!formtags.len) break;
7965 0 0 while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
0 0 while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
7967 0 0 if (formtags.len) formtags.len--, formtags.str++;
7971 0 0 if (filter.matches(tag.c_str())) {
7972 0 0 if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len));
0 0 if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len));
7973 0 0 forms.back().forms.emplace_back(string(form_start, form_len), tag);
7977 0 0 if (any_result) return NO_GUESSER;
7985 0 0 while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++;
0 0 while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++;
7991 0 0 while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++;
0 0 while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++;
7997 0 0 while (form_len < form.len && form.str[form_len] != ' ') form_len++;
0 0 while (form_len < form.len && form.str[form_len] != ' ') form_len++;
8118 1 0 if (!compressor::load(is, data)) return false;
1 0 if (!compressor::load(is, data)) return false;
8122 1 0 unsigned length = data.next_1B();
8123 1 0 unknown_tag.assign(data.next(length), length);
8124 1 0 length = data.next_1B();
8125 1 0 number_tag.assign(data.next(length), length);
8126 1 0 length = data.next_1B();
8127 1 0 punctuation_tag.assign(data.next(length), length);
8128 1 0 length = data.next_1B();
8129 1 0 symbol_tag.assign(data.next(length), length);
8132 1 0 dictionary.load(data);
8136 1 0 if (data.next_1B()) {
1 0 if (data.next_1B()) {
8137 1 0 statistical_guesser.reset(new morpho_statistical_guesser());
8138 1 0 statistical_guesser->load(data);
8139 0 0 }
8150 7 0 if (form.len) {
8154 7 0 generate_casing_variants(form, form_uclc, form_lc);
8157 7 0 dictionary.analyze(form, lemmas);
8158 0 7 if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
0 0 if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
8159 1 6 if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
1 0 if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
8160 0 7 if (!lemmas.empty()) return NO_GUESSER;
8163 0 0 analyze_special(form, lemmas);
8164 0 0 if (!lemmas.empty()) return NO_GUESSER;
8167 0 0 if (guesser == GUESSER && statistical_guesser) {
0 0 if (guesser == GUESSER && statistical_guesser) {
0 0 if (guesser == GUESSER && statistical_guesser) {
8168 0 0 if (form_uclc.empty() && form_lc.empty())
0 0 if (form_uclc.empty() && form_lc.empty())
0 0 if (form_uclc.empty() && form_lc.empty())
8169 0 0 statistical_guesser->analyze(form, lemmas, nullptr);
8171 0 0 morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3);
8172 0 0 statistical_guesser->analyze(form, lemmas, &used_rules);
8173 0 0 if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules);
0 0 if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules);
8174 0 0 if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules);
0 0 if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules);
8177 0 0 if (!lemmas.empty()) return GUESSER;
8180 0 0 lemmas.emplace_back(string(form.str, form.len), unknown_tag);
8189 0 0 if (lemma.len) {
8190 0 0 if (dictionary.generate(lemma, filter, forms))
0 0 if (dictionary.generate(lemma, filter, forms))
8220 0 0 if (!form.len) return;
8228 0 0 if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
8229 0 0 while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
8230 0 0 if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len);
0 0 if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len);
0 0 if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len);
8231 0 0 while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
8232 0 0 if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
0 0 if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
8234 0 0 if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
8236 0 0 while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
8239 0 0 if (any_digit && !number.len && (!codepoint || codepoint == '.')) {
0 0 if (any_digit && !number.len && (!codepoint || codepoint == '.')) {
0 0 if (any_digit && !number.len && (!codepoint || codepoint == '.')) {
8240 0 0 lemmas.emplace_back(string(form.str, form.len), number_tag);
8247 0 0 while (form.len) {
8249 0 0 punctuation = punctuation && unicode::category(codepoint) & unicode::P;
0 0 punctuation = punctuation && unicode::category(codepoint) & unicode::P;
8250 0 0 symbol = symbol && unicode::category(codepoint) & unicode::S;
0 0 symbol = symbol && unicode::category(codepoint) & unicode::S;
8252 0 0 if (punctuation)
8253 0 0 lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag);
8254 0 0 else if (symbol)
8255 0 0 lemmas.emplace_back(string(form_ori.str, form_ori.len), symbol_tag);
8302 0 0 construct(std::map(map.begin(), map.end()), load_factor, entry_encode);
0 0 construct(std::map(map.begin(), map.end()), load_factor, entry_encode);
0 0 construct(std::map(map.begin(), map.end()), load_factor, entry_encode);
0 0 construct(std::map(map.begin(), map.end()), load_factor, entry_encode);
8308 0 0 std::map enlarged_map(map.begin(), map.end());
0 0 std::map enlarged_map(map.begin(), map.end());
8310 0 0 for (auto&& entry : map) {
0 0 for (auto&& entry : map) {
8313 0 0 if (!key.empty() && add_prefixes)
0 0 if (!key.empty() && add_prefixes)
0 0 if (!key.empty() && add_prefixes)
0 0 if (!key.empty() && add_prefixes)
0 0 if (!key.empty() && add_prefixes)
0 0 if (!key.empty() && add_prefixes)
8314 0 0 for (unsigned i = key.size() - 1; i; i--)
0 0 for (unsigned i = key.size() - 1; i; i--)
8315 0 0 enlarged_map[key.substr(0, i)];
0 0 enlarged_map[key.substr(0, i)];
0 0 enlarged_map[key.substr(0, i)];
0 0 enlarged_map[key.substr(0, i)];
8317 0 0 if (!key.empty() && add_suffixes)
0 0 if (!key.empty() && add_suffixes)
0 0 if (!key.empty() && add_suffixes)
0 0 if (!key.empty() && add_suffixes)
0 0 if (!key.empty() && add_suffixes)
0 0 if (!key.empty() && add_suffixes)
8318 0 0 for (unsigned i = 1; i < key.size(); i++)
0 0 for (unsigned i = 1; i < key.size(); i++)
8319 0 0 enlarged_map[key.substr(i)];
0 0 enlarged_map[key.substr(i)];
0 0 enlarged_map[key.substr(i)];
0 0 enlarged_map[key.substr(i)];
8322 0 0 construct(enlarged_map, load_factor, entry_encode);
0 0 construct(enlarged_map, load_factor, entry_encode);
8333 0 0 for (auto&& elem : map) {
0 0 for (auto&& elem : map) {
0 0 for (auto&& elem : map) {
0 0 for (auto&& elem : map) {
8335 0 0 if (len >= sizes.size()) sizes.resize(len + 1);
0 0 if (len >= sizes.size()) sizes.resize(len + 1);
0 0 if (len >= sizes.size()) sizes.resize(len + 1);
0 0 if (len >= sizes.size()) sizes.resize(len + 1);
0 0 if (len >= sizes.size()) sizes.resize(len + 1);
0 0 if (len >= sizes.size()) sizes.resize(len + 1);
0 0 if (len >= sizes.size()) sizes.resize(len + 1);
0 0 if (len >= sizes.size()) sizes.resize(len + 1);
8338 0 0 for (auto&& size : sizes)
0 0 for (auto&& size : sizes)
0 0 for (auto&& size : sizes)
0 0 for (auto&& size : sizes)
8339 0 0 resize(unsigned(load_factor * size));
0 0 resize(unsigned(load_factor * size));
0 0 resize(unsigned(load_factor * size));
0 0 resize(unsigned(load_factor * size));
8342 0 0 for (auto&& elem : map) {
0 0 for (auto&& elem : map) {
0 0 for (auto&& elem : map) {
0 0 for (auto&& elem : map) {
8343 0 0 binary_encoder enc;
0 0 binary_encoder enc;
0 0 binary_encoder enc;
0 0 binary_encoder enc;
8344 0 0 entry_encode(enc, elem.second);
0 0 entry_encode(enc, elem.second);
0 0 entry_encode(enc, elem.second);
0 0 entry_encode(enc, elem.second);
8347 0 0 done_adding();
0 0 done_adding();
0 0 done_adding();
0 0 done_adding();
8350 0 0 for (auto&& elem : map) {
0 0 for (auto&& elem : map) {
0 0 for (auto&& elem : map) {
0 0 for (auto&& elem : map) {
8351 0 0 binary_encoder enc;
0 0 binary_encoder enc;
0 0 binary_encoder enc;
0 0 binary_encoder enc;
8352 0 0 entry_encode(enc, elem.second);
0 0 entry_encode(enc, elem.second);
0 0 entry_encode(enc, elem.second);
0 0 entry_encode(enc, elem.second);
8361 0 0 for (auto&& hash : hashes)
8420 1 0 return unique_ptr(new T(std::forward(args)...));
0 0 return unique_ptr(new T(std::forward(args)...));
0 0 return unique_ptr(new T(std::forward(args)...));
8460 0 0 if (!*str) return;
8462 0 0 for (auto&& child : children)
8463 0 0 if (child.first == *str) {
8468 0 0 children.emplace_back(*str, new_unique_ptr());
8476 0 0 find_candidate_prefix(max_suffix_len, current, best, best_length, 0);
8480 0 0 if (depth < max_suffix_len && length > best_length) {
0 0 if (depth < max_suffix_len && length > best_length) {
8484 0 0 for (auto&& child : children) {
8486 0 0 child.second->find_candidate_prefix(max_suffix_len, current, best, best_length, children.size() == 1 ? length + 1 : 1);
8498 0 0 if (str.size() >= lengths.size()) lengths.resize(str.size() + 1);
8504 0 0 for (auto&& set : lengths)
8513 0 0 this->lemma = lemma.substr(0, addinfo.parse(lemma, true));
8524 0 0 bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); }
0 0 bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); }
0 0 bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); }
8528 0 0 bool operator<(const lemma_info& other) const { return lemma < other.lemma || (lemma == other.lemma && addinfo.data < other.addinfo.data); }
8547 0 0 dict.load(is, max_suffix_len);
8550 0 0 dict.encode(enc);
8559 0 0 while(raw.next_lemma(lemma, forms)) {
0 0 while(raw.next_lemma(lemma, forms)) {
8563 0 0 if (forms_end != forms.end()) {
8569 0 0 lemmas.emplace_back(lemma);
8571 0 0 lemmas_hist.add(lemma_info.lemma);
8574 0 0 while (!forms.empty()) {
8576 0 0 for (auto&& form : forms)
8577 0 0 t.add(form.first.c_str());
8580 0 0 string prefix = t.find_candidate_prefix(max_suffix_len);
8584 0 0 while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++;
0 0 while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++;
0 0 while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++;
0 0 while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++;
8585 0 0 if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!");
0 0 if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!");
0 0 if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!");
8587 0 0 while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++;
0 0 while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++;
0 0 while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++;
0 0 while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++;
8591 0 0 while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++;
0 0 while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++;
0 0 while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++;
8594 0 0 for (auto form = start; form != end; form++) {
8595 0 0 if (!clas.empty()) clas.push_back('\t');
0 0 if (!clas.empty()) clas.push_back('\t');
8596 0 0 clas.append(form->first, common_prefix, string::npos);
8597 0 0 clas.push_back('\t');
8603 0 0 if (class_it.second) {
8605 0 0 for (auto form = start; form != end; form++) {
8607 0 0 if (tag >= int(tags.size())) tags.emplace_back(form->second);
0 0 if (tag >= int(tags.size())) tags.emplace_back(form->second);
8608 0 0 suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag);
0 0 suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag);
0 0 suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag);
8613 0 0 lemma_info.forms.emplace_back(start->first.substr(0, common_prefix), class_id);
0 0 lemma_info.forms.emplace_back(start->first.substr(0, common_prefix), class_id);
8614 0 0 forms_hist.add(lemma_info.forms.back().form);
8630 0 0 for (auto&& lemma : lemmas) {
8632 0 0 while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++;
0 0 while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++;
0 0 while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++;
8634 0 0 enc.add_1B(prev.length() - cpl);
8635 0 0 enc.add_1B(lemma.lemma.size() - cpl);
8636 0 0 enc.add_data(lemma.lemma.substr(cpl));
8637 0 0 enc.add_1B(lemma.addinfo.data.size());
8639 0 0 enc.add_1B(lemma.forms.size());
8642 0 0 for (auto&& lemma_form : lemma.forms) {
8644 0 0 for (unsigned prev_from = 0; prev_from < prev_form.size(); prev_from++)
8645 0 0 for (unsigned form_from = 0; form_from < lemma_form.form.size(); form_from++) {
8647 0 0 while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++;
0 0 while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++;
0 0 while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++;
0 0 while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++;
8648 0 0 if (len > best_len) best_prev_from = prev_from, best_form_from = form_from, best_len = len;
8652 0 0 enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
0 0 enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
0 0 enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
0 0 enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
8654 0 0 if (best_prev_from > 0) enc.add_1B(best_prev_from);
0 0 if (best_prev_from > 0) enc.add_1B(best_prev_from);
8655 0 0 if (best_prev_from + best_len < prev_form.size()) enc.add_1B(prev_form.size() - best_prev_from - best_len);
0 0 if (best_prev_from + best_len < prev_form.size()) enc.add_1B(prev_form.size() - best_prev_from - best_len);
8656 0 0 if (best_form_from > 0) {
8657 0 0 enc.add_1B(best_form_from);
8658 0 0 enc.add_data(lemma_form.form.substr(0, best_form_from));
8660 0 0 if (best_form_from + best_len < lemma_form.form.size()) {
8661 0 0 enc.add_1B(lemma_form.form.size() - best_form_from - best_len);
8662 0 0 enc.add_data(lemma_form.form.substr(best_form_from + best_len));
8664 0 0 enc.add_2B(lemma_form.clas);
8673 0 0 enc.add_2B(tags.size());
8674 0 0 for (auto&& tag : tags) {
8675 0 0 enc.add_1B(tag.size());
8680 0 0 persistent_unordered_map(suffixes, 5, false, true, [](binary_encoder& enc, const map>& suffix) {
8682 0 0 for (auto&& clas : suffix)
8685 0 0 for (auto&& clas : suffix) {
8686 0 0 enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags);
8690 0 0 enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags);
8691 0 0 for (auto&& clas : suffix)
8692 0 0 for (auto&& tag : clas.second)
8694 0 0 }).save(enc);
8761 0 0 enc.add_1B(tags.unknown_tag.size());
8763 0 0 enc.add_1B(tags.number_tag.size());
8765 0 0 enc.add_1B(tags.punctuation_tag.size());
8767 0 0 enc.add_1B(tags.symbol_tag.size());
8771 0 0 morpho_dictionary_encoder::encode(in_dictionary, max_suffix_len, enc);
8774 0 0 enc.add_1B(bool(in_statistical_guesser));
8775 0 0 if (in_statistical_guesser) {
8777 0 0 morpho_statistical_guesser_encoder::encode(in_statistical_guesser, enc);
8782 0 0 if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!");
0 0 if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!");
0 0 if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!");
0 0 if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!");
8851 0 0 if (res->load(is)) return res.release();
0 0 if (res->load(is)) return res.release();
8860 0 0 3);
0 0 3);
8861 0 0 if (res->load(is)) return res.release();
0 0 if (res->load(is)) return res.release();
8867 0 0 if (res->load(is)) return res.release();
0 0 if (res->load(is)) return res.release();
8873 1 0 if (res->load(is)) return res.release();
1 0 if (res->load(is)) return res.release();
8879 0 0 if (res->load(is)) return res.release();
0 0 if (res->load(is)) return res.release();
8885 0 0 if (!derinet->load(is)) return nullptr;
0 0 if (!derinet->load(is)) return nullptr;
8887 0 0 unique_ptr dictionary(load(is));
8888 0 0 if (!dictionary) return nullptr;
8899 0 0 ifstream f(path_from_utf8(fname).c_str(), ifstream::binary);
8900 0 0 if (!f) return nullptr;
8902 0 0 return load(f);
8929 6 1 for (auto&& tag : tags) {
8931 397 6 for (unsigned i = 0; i < tag.size(); i++)
8942 0 0 if (!used) return false;
8944 0 0 for (auto&& used_rule : *used)
8945 0 0 if (used_rule == rule)
8957 0 0 string rule_label; rule_label.reserve(12);
8959 0 0 for (; suffix_len < form.len; suffix_len++) {
8960 0 0 rule_label.push_back(form.str[form.len - (suffix_len + 1)]);
8961 0 0 if (!rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next(data.next_2B()); }))
8965 0 0 for (suffix_len++; suffix_len--; ) {
8967 0 0 rule_label.push_back(' ');
8971 0 0 for (unsigned prefix_len = 0; prefix_len + suffix_len <= form.len; prefix_len++) {
8972 0 0 if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]);
0 0 if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]);
8974 0 0 if (!found) break;
8975 0 0 if (*(found += sizeof(uint16_t))) {
8981 0 0 if (rule) {
8983 0 0 if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' '
0 0 if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' '
0 0 if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' '
8984 0 0 if (used) used->push_back(rule_label);
0 0 if (used) used->push_back(rule_label);
8985 0 0 for (int rules_len = *rule++; rules_len; rules_len--) {
8992 0 0 if (pref_del_len + suff_del_len > form.len ||
0 0 if (pref_del_len + suff_del_len > form.len ||
8993 0 0 (pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) ||
0 0 (pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) ||
8994 0 0 (suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) ||
0 0 (suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) ||
0 0 (suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) ||
8999 0 0 lemma.reserve(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len);
9000 0 0 if (pref_add_len) lemma.append(pref_add, pref_add_len);
0 0 if (pref_add_len) lemma.append(pref_add, pref_add_len);
9001 0 0 if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len);
0 0 if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len);
9002 0 0 if (suff_add_len) lemma.append(suff_add, suff_add_len);
0 0 if (suff_add_len) lemma.append(suff_add, suff_add_len);
9003 0 0 while (tags_len--)
9004 0 0 lemmas.emplace_back(lemma, this->tags[unaligned_load_inc(tags)]);
9012 0 0 if (lemmas.size() == lemmas_initial_size)
9013 0 0 if (!contains(used, string())) {
9014 0 0 if (used) used->push_back(string());
9015 0 0 lemmas.emplace_back(string(form.str, form.len), tags[default_tag]);
9050 0 0 if (text.empty()) return;
9053 0 0 for (string::size_type next; (next = text.find(sep, index)) != string::npos; index = next + 1)
9061 53 0 if (!text.len) return;
9064 68 53 for (const char* next; (next = (const char*) memchr(str, sep, text.str + text.len - str)); str = next + 1)
9095 0 0 if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file");
0 0 if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file");
0 0 if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file");
0 0 if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file");
9097 0 0 if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data());
0 0 if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data());
9099 0 0 while (getline(is, line)) {
0 0 while (getline(is, line)) {
9100 0 0 split(line, '\t', tokens);
9101 0 0 if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!");
0 0 if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!");
0 0 if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!");
0 0 if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!");
0 0 if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!");
9104 0 0 split(tokens[0], ' ', affixes);
9105 0 0 if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!");
0 0 if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!");
0 0 if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!");
9108 0 0 auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]];
0 0 auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]];
9109 0 0 for (unsigned i = 1; i < tokens.size(); i+= 2) {
9111 0 0 split(tokens[i], ' ', replacements);
9112 0 0 if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!");
0 0 if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!");
0 0 if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!");
9115 0 0 split(tokens[i+1], ' ', rule_tags);
9117 0 0 for (auto&& rule_tag : rule_tags) {
9119 0 0 if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag);
0 0 if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag);
9120 0 0 decoded_tags.emplace_back(tag);
9123 0 0 rules.emplace_back(replacements, decoded_tags);
9128 0 0 enc.add_2B(tags.size());
9129 0 0 for (auto&& tag : tags) {
9130 0 0 enc.add_1B(tag.size());
9133 0 0 enc.add_2B(statistical_guesser_default);
9137 0 0 e.add_1B(rules.size());
9138 0 0 for (auto&& rule : rules) {
9139 0 0 if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!");
0 0 if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!");
0 0 if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!");
9140 0 0 for (auto&& affix : rule.first) {
9141 0 0 e.add_1B(affix.size());
9144 0 0 e.add_1B(rule.second.size());
9145 0 0 for (auto&& tag : rule.second)
9146 0 0 e.add_2B(tag);
9148 0 0 enc.add_2B(e.data.size());
9150 0 0 }).save(enc);
0 0 }).save(enc);
9211 0 0 for (string line; getline(is, line);) {
0 0 for (string line; getline(is, line);) {
9212 0 0 if (line.empty()) continue;
9214 0 0 split(line, '\t', tokens);
9215 0 0 if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!");
0 0 if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!");
0 0 if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!");
9216 0 0 if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!");
0 0 if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!");
0 0 if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!");
0 0 if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!");
0 0 if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!");
0 0 if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!");
9221 0 0 if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) ||
0 0 if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) ||
0 0 if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) ||
9223 0 0 set_casing(tokens[0], lemma_case, form);
9228 0 0 data.emplace_back(form, tokens[1], tokens[2]);
9233 0 0 for (auto&& instance : data)
9234 0 0 if (!instance.form_prefix.empty())
9238 0 0 for (auto&& prefix : prefixes_with_forms)
9239 0 0 if (prefix.second.size() >= min_prefix_count)
9240 0 0 prefixes_with_counts.emplace_back(unsigned(prefix.second.size()), prefix.first);
9242 0 0 if (prefixes_with_counts.size() > max_prefixes) {
9244 0 0 prefixes_with_counts.resize(max_prefixes);
9249 0 0 for (auto&& prefix : prefixes_with_counts)
9257 0 0 for (auto&& instance : data) {
9263 0 0 for (auto&& prefix : prefixes)
9264 0 0 if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0)
0 0 if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0)
0 0 if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0)
0 0 if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0)
9267 0 0 tag_lemma_rule.assign(instance.lemma_rule).append("\t").append(instance.tag);
9270 0 0 for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) {
0 0 for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) {
0 0 for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) {
9271 0 0 prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length);
0 0 prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length);
0 0 prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length);
9279 0 0 for (auto&& tag : tags)
9280 0 0 if (tag.second.size() > most_frequent_tag_count)
9289 0 0 for (auto&& suffix : suffixes) {
9290 0 0 for (auto&& prefix : prefixes) {
9295 0 0 for (int prefix_len = int(prefix.size()); prefix_len >= 0; prefix_len -= prefix.empty() ? 1 : prefix.size()) {
0 0 for (int prefix_len = int(prefix.size()); prefix_len >= 0; prefix_len -= prefix.empty() ? 1 : prefix.size()) {
9296 0 0 for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) {
0 0 for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) {
0 0 for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) {
9297 0 0 rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len);
0 0 rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len);
0 0 rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len);
9298 0 0 if (!rules.count(rule_key)) continue;
9301 0 0 for (auto&& entry : rules[rule_key])
9302 0 0 if (!rules_set.count(entry.first)) {
9303 0 0 rules_counts.emplace_back(unsigned(entry.second.size()), entry.first);
9309 0 0 if (rules_counts.size() >= rules_per_suffix) {
9310 0 0 rules_counts.resize(rules_per_suffix);
9315 0 0 if (rules_set.empty()) break;
9317 0 0 if (!rules_set.empty()) {
9319 0 0 output.assign(prefix).append(" ").append(suffix);
9320 0 0 for (unsigned i = 0; i < rules_counts.size(); i++) {
9323 0 0 output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos);
0 0 output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos);
0 0 output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos);
0 0 output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos);
9326 0 0 for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++)
0 0 for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++)
0 0 for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++)
0 0 for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++)
9327 0 0 output.append(" ").append(rules_counts[i+1].second, tab + 1, string::npos);
0 0 output.append(" ").append(rules_counts[i+1].second, tab + 1, string::npos);
9342 0 0 for (int offset = -int(lemma.size() - 1); offset < int(form.size()) - 1; offset++) {
9345 0 0 for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++)
0 0 for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++)
0 0 for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++)
9346 0 0 if (form[form_offset] == lemma[lemma_offset]) {
9347 0 0 if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length))
0 0 if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length))
0 0 if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length))
9354 0 0 form_prefix.assign(form, 0, lemma_best == 0 ? form_best : 0);
0 0 form_prefix.assign(form, 0, lemma_best == 0 ? form_best : 0);
9355 0 0 lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ")
0 0 lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ")
0 0 lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ")
0 0 lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ")
9356 0 0 .append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos);
0 0 .append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos);
0 0 .append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos);
9364 0 0 for (auto&& chr : utf8::decoder(word)) {
9368 0 0 if (allow_nonletters && index >= 2 && cat & ~unicode::L) continue;
0 0 if (allow_nonletters && index >= 2 && cat & ~unicode::L) continue;
9369 0 0 if (cat & ~unicode::L) return CASE_OTHER;
9371 0 0 if (index == 0) {
9372 0 0 c = cat & unicode::Ll ? CASE_LC : CASE_UC;
9373 0 0 } else if (c == CASE_UC && index == 1) {
9374 0 0 c = cat & unicode::Ll ? CASE_UCLC : CASE_UC;
9375 0 0 } else if (c == CASE_UC) {
9376 0 0 if (cat & ~unicode::Lut) return CASE_OTHER;
9378 0 0 if (cat & ~unicode::Ll) return CASE_OTHER;
9390 0 0 for (auto&& chr : utf8::decoder(original)) {
9391 0 0 utf8::append(word, (c == CASE_UC || (c == CASE_UCLC && first)) ? unicode::uppercase(chr) : unicode::lowercase(chr));
0 0 utf8::append(word, (c == CASE_UC || (c == CASE_UCLC && first)) ? unicode::uppercase(chr) : unicode::lowercase(chr));
9400 0 0 while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional))
0 0 while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional))
0 0 while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional))
9403 0 0 if (additional + length > word.size()) return false;
9427 0 0 if (line.empty()) {
9428 0 0 if (!getline(in, line))
9431 0 0 if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!");
0 0 if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!");
0 0 if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!");
9435 0 0 if (seen_lemmas.count(lemma))
9436 0 0 training_failure("Raw morphological dictionary contains lemma '" << lemma << "' multiple times - all forms of one lemma must be in continuous region!");
0 0 training_failure("Raw morphological dictionary contains lemma '" << lemma << "' multiple times - all forms of one lemma must be in continuous region!");
9441 0 0 while (getline(in, line)) {
9443 0 0 if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!");
0 0 if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!");
0 0 if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!");
9445 0 0 if (lemma != tokens[0]) break;
9470 0 0 if (!filter) return;
9472 0 0 wildcard.assign(filter);
9475 0 0 for (int tag_pos = 0, filter_pos = 0; filter[filter_pos]; tag_pos++, filter_pos++) {
9476 0 0 if (filter[filter_pos] == '?') continue;
9477 0 0 if (filter[filter_pos] == '[') {
9481 0 0 if (filter[filter_pos] == '^') negate = true, filter_pos++;
9484 0 0 for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false)
0 0 for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false)
0 0 for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false)
9487 0 0 filters.emplace_back(tag_pos, negate, chars_start, filter_pos - chars_start);
9488 0 0 if (!filter[filter_pos]) break;
9490 0 0 filters.emplace_back(tag_pos, false, filter_pos, 1);
9543 0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
0 0 return it ? unaligned_load(it) : elementary_feature_unknown;
10 0 return it ? unaligned_load(it) : elementary_feature_unknown;
8 2 return it ? unaligned_load(it) : elementary_feature_unknown;
10 0 return it ? unaligned_load(it) : elementary_feature_unknown;
5 14 return it ? unaligned_load(it) : elementary_feature_unknown;
7 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
2 0 return it ? unaligned_load(it) : elementary_feature_unknown;
9551 1 0 if (!compressor::load(is, data)) return false;
1 0 if (!compressor::load(is, data)) return false;
9554 1 0 maps.resize(data.next_1B());
1 0 maps.resize(data.next_1B());
9555 27 1 for (auto&& map : maps)
9556 27 0 map.load(data);
0 0 map.load(data);
9598 1171 0 if (value < 0x80) *where++ = value;
9599 0 0 else if (value < 0x4000) *where++ = (value >> 7) | 0x80u, *where++ = value & 0x7Fu;
9600 0 0 else if (value < 0x200000) *where++ = (value >> 14) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu;
9601 0 0 else if (value < 0x10000000) *where++ = (value >> 21) | 0x80u, *where++ = ((value >> 14) & 0x7Fu) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu;
9608 0 0 while (((unsigned char)(*from)) & 0x80u) value = (value << 7) | (((unsigned char)(*from++)) ^ 0x80u);
0 0 while (((unsigned char)(*from)) & 0x80u) value = (value << 7) | (((unsigned char)(*from++)) ^ 0x80u);
9643 0 0 struct feature_sequence {
0 0 struct feature_sequence {
9649 0 0 class feature_sequences {
0 0 class feature_sequences {
1 0 class feature_sequences {
0 0 class feature_sequences {
0 0 class feature_sequences {
9678 0 0 return it ? unaligned_load(it) : 0;
0 0 return it ? unaligned_load(it) : 0;
270 76 return it ? unaligned_load(it) : 0;
9687 1 0 if (!elementary.load(is)) return false;
0 0 if (!elementary.load(is)) return false;
0 0 if (!elementary.load(is)) return false;
9690 1 0 if (!compressor::load(is, data)) return false;
1 0 if (!compressor::load(is, data)) return false;
0 0 if (!compressor::load(is, data)) return false;
0 0 if (!compressor::load(is, data)) return false;
0 0 if (!compressor::load(is, data)) return false;
0 0 if (!compressor::load(is, data)) return false;
9693 1 0 sequences.resize(data.next_1B());
1 0 sequences.resize(data.next_1B());
0 0 sequences.resize(data.next_1B());
0 0 sequences.resize(data.next_1B());
0 0 sequences.resize(data.next_1B());
0 0 sequences.resize(data.next_1B());
9694 74 1 for (auto&& sequence : sequences) {
0 0 for (auto&& sequence : sequences) {
0 0 for (auto&& sequence : sequences) {
9695 74 0 sequence.dependant_range = data.next_4B();
0 0 sequence.dependant_range = data.next_4B();
0 0 sequence.dependant_range = data.next_4B();
9696 74 0 sequence.elements.resize(data.next_1B());
74 0 sequence.elements.resize(data.next_1B());
0 0 sequence.elements.resize(data.next_1B());
0 0 sequence.elements.resize(data.next_1B());
0 0 sequence.elements.resize(data.next_1B());
0 0 sequence.elements.resize(data.next_1B());
9697 154 74 for (auto&& element : sequence.elements) {
0 0 for (auto&& element : sequence.elements) {
0 0 for (auto&& element : sequence.elements) {
9698 154 0 element.type = elementary_feature_type(data.next_4B());
0 0 element.type = elementary_feature_type(data.next_4B());
0 0 element.type = elementary_feature_type(data.next_4B());
9699 154 0 element.elementary_index = data.next_4B();
0 0 element.elementary_index = data.next_4B();
0 0 element.elementary_index = data.next_4B();
9700 154 0 element.sequence_index = data.next_4B();
0 0 element.sequence_index = data.next_4B();
0 0 element.sequence_index = data.next_4B();
9704 1 0 scores.resize(data.next_1B());
1 0 scores.resize(data.next_1B());
0 0 scores.resize(data.next_1B());
0 0 scores.resize(data.next_1B());
0 0 scores.resize(data.next_1B());
0 0 scores.resize(data.next_1B());
9705 74 1 for (auto&& score : scores)
0 0 for (auto&& score : scores)
0 0 for (auto&& score : scores)
9706 74 0 score.load(data);
0 0 score.load(data);
0 0 score.load(data);
0 0 score.load(data);
0 0 score.load(data);
0 0 score.load(data);
9726 0 0 cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {}
0 0 cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {}
0 0 cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {}
0 0 cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {}
9734 0 0 caches.reserve(self.sequences.size());
0 0 caches.reserve(self.sequences.size());
1 0 caches.reserve(self.sequences.size());
0 0 caches.reserve(self.sequences.size());
9736 0 0 for (auto&& sequence : self.sequences) {
0 0 for (auto&& sequence : self.sequences) {
74 1 for (auto&& sequence : self.sequences) {
0 0 for (auto&& sequence : self.sequences) {
9737 0 0 caches.emplace_back(int(sequence.elements.size()));
0 0 caches.emplace_back(int(sequence.elements.size()));
74 0 caches.emplace_back(int(sequence.elements.size()));
0 0 caches.emplace_back(int(sequence.elements.size()));
9738 0 0 if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size();
0 0 if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size();
2 72 if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size();
0 0 if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size();
9739 0 0 for (auto&& element : sequence.elements)
0 0 for (auto&& element : sequence.elements)
154 74 for (auto&& element : sequence.elements)
0 0 for (auto&& element : sequence.elements)
9740 0 0 if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size)
0 0 if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size)
0 0 if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size)
0 0 if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size)
92 62 if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size)
2 90 if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size)
0 0 if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size)
0 0 if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size)
9743 0 0 key.resize(max_sequence_elements * vli::max_length());
0 0 key.resize(max_sequence_elements * vli::max_length());
1 0 key.resize(max_sequence_elements * vli::max_length());
0 0 key.resize(max_sequence_elements * vli::max_length());
9744 0 0 window.resize(max_window_size);
0 0 window.resize(max_window_size);
1 0 window.resize(max_window_size);
0 0 window.resize(max_window_size);
9755 0 0 if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2);
0 0 if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2);
1 0 if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2);
0 0 if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2);
9756 0 0 if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2);
0 0 if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2);
1 0 if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2);
0 0 if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2);
9757 0 0 for (unsigned i = 0; i < forms.size(); i++)
0 0 for (unsigned i = 0; i < forms.size(); i++)
7 1 for (unsigned i = 0; i < forms.size(); i++)
0 0 for (unsigned i = 0; i < forms.size(); i++)
9758 0 0 if (analyses[i].size() > c.elementary_per_tag[i].size())
0 0 if (analyses[i].size() > c.elementary_per_tag[i].size())
7 0 if (analyses[i].size() > c.elementary_per_tag[i].size())
0 0 if (analyses[i].size() > c.elementary_per_tag[i].size())
9766 0 0 for (auto&& cache : c.caches)
0 0 for (auto&& cache : c.caches)
74 1 for (auto&& cache : c.caches)
0 0 for (auto&& cache : c.caches)
9772 0 0 elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic);
0 0 elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic);
12 3 elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic);
0 0 elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic);
9778 0 0 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
0 0 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
0 0 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
0 0 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
0 0 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
0 0 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
36 7 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
6 30 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
30 13 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
0 0 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
0 0 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
0 0 for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
9783 0 0 for (unsigned i = 0; i < sequences.size(); i++) {
0 0 for (unsigned i = 0; i < sequences.size(); i++) {
658 8 for (unsigned i = 0; i < sequences.size(); i++) {
0 0 for (unsigned i = 0; i < sequences.size(); i++) {
9784 0 0 if (tags_unchanged >= sequences[i].dependant_range)
0 0 if (tags_unchanged >= sequences[i].dependant_range)
653 5 if (tags_unchanged >= sequences[i].dependant_range)
0 0 if (tags_unchanged >= sequences[i].dependant_range)
9788 0 0 for (unsigned j = 0; j < sequences[i].elements.size(); j++) {
0 0 for (unsigned j = 0; j < sequences[i].elements.size(); j++) {
1345 479 for (unsigned j = 0; j < sequences[i].elements.size(); j++) {
0 0 for (unsigned j = 0; j < sequences[i].elements.size(); j++) {
9794 0 0 value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index];
0 0 value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index];
0 0 value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index];
0 0 value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index];
458 17 value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index];
452 6 value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index];
0 0 value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index];
0 0 value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index];
9797 0 0 value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index];
0 0 value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index];
778 66 value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index];
0 0 value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index];
9804 0 0 if (value == elementary_feature_unknown) {
0 0 if (value == elementary_feature_unknown) {
174 1171 if (value == elementary_feature_unknown) {
0 0 if (value == elementary_feature_unknown) {
9813 0 0 if (!key_size) {
0 0 if (!key_size) {
174 479 if (!key_size) {
0 0 if (!key_size) {
9816 0 0 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
0 0 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
0 0 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
0 0 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
0 0 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
0 0 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
355 124 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
222 133 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
346 133 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
0 0 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
0 0 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
0 0 } else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
9833 0 0 for (unsigned i = 0; i < c.caches.size(); i++)
9877 0 0 cache(const viterbi& self) : features_cache(self.features) {}
0 0 cache(const viterbi& self) : features_cache(self.features) {}
1 0 cache(const viterbi& self) : features_cache(self.features) {}
0 0 cache(const viterbi& self) : features_cache(self.features) {}
9890 0 0 if (!forms.size()) return;
0 0 if (!forms.size()) return;
1 0 if (!forms.size()) return;
0 0 if (!forms.size()) return;
9894 0 0 for (unsigned i = 0, states = 1; i < forms.size(); i++) {
0 0 for (unsigned i = 0, states = 1; i < forms.size(); i++) {
7 1 for (unsigned i = 0, states = 1; i < forms.size(); i++) {
0 0 for (unsigned i = 0, states = 1; i < forms.size(); i++) {
9895 0 0 if (analyses[i].empty()) return;
0 0 if (analyses[i].empty()) return;
7 0 if (analyses[i].empty()) return;
0 0 if (analyses[i].empty()) return;
9896 0 0 states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size();
0 0 states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size();
5 2 states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size();
0 0 states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size();
9899 0 0 if (nodes > c.nodes.size()) c.nodes.resize(nodes);
0 0 if (nodes > c.nodes.size()) c.nodes.resize(nodes);
1 0 if (nodes > c.nodes.size()) c.nodes.resize(nodes);
0 0 if (nodes > c.nodes.size()) c.nodes.resize(nodes);
9905 0 0 int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data());
0 0 int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data());
0 0 int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data());
0 0 int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data());
0 1 int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data());
0 0 int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data());
0 0 int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data());
0 0 int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data());
9911 0 0 for (unsigned i = 0; i < forms.size(); i++) {
0 0 for (unsigned i = 0; i < forms.size(); i++) {
7 1 for (unsigned i = 0; i < forms.size(); i++) {
0 0 for (unsigned i = 0; i < forms.size(); i++) {
9914 0 0 for (int j = 0; j < window_size; j++) window[j] = -1;
0 0 for (int j = 0; j < window_size; j++) window[j] = -1;
7 21 for (int j = 0; j < window_size; j++) window[j] = -1;
0 0 for (int j = 0; j < window_size; j++) window[j] = -1;
9915 0 0 for (int tag = 0; tag < int(analyses[i].size()); tag++)
0 0 for (int tag = 0; tag < int(analyses[i].size()); tag++)
10 7 for (int tag = 0; tag < int(analyses[i].size()); tag++)
0 0 for (int tag = 0; tag < int(analyses[i].size()); tag++)
9916 0 0 for (int prev = nodes_prev; prev < nodes_now; prev++) {
0 0 for (int prev = nodes_prev; prev < nodes_now; prev++) {
15 10 for (int prev = nodes_prev; prev < nodes_now; prev++) {
0 0 for (int prev = nodes_prev; prev < nodes_now; prev++) {
9920 0 0 for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) {
0 0 for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) {
0 0 for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) {
0 0 for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) {
27 9 for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) {
21 6 for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) {
0 0 for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) {
0 0 for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) {
9921 0 0 same_tags += same_tags == n && window[n] == c.nodes[p].tag;
0 0 same_tags += same_tags == n && window[n] == c.nodes[p].tag;
0 0 same_tags += same_tags == n && window[n] == c.nodes[p].tag;
0 0 same_tags += same_tags == n && window[n] == c.nodes[p].tag;
7 14 same_tags += same_tags == n && window[n] == c.nodes[p].tag;
5 2 same_tags += same_tags == n && window[n] == c.nodes[p].tag;
0 0 same_tags += same_tags == n && window[n] == c.nodes[p].tag;
0 0 same_tags += same_tags == n && window[n] == c.nodes[p].tag;
9926 0 0 features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache);
0 0 features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache);
12 3 features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache);
0 0 features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache);
9927 0 0 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
0 0 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
0 0 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
0 0 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
0 0 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
0 0 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
7 8 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
5 2 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
12 3 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
0 0 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
0 0 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
0 0 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
0 0 score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
9931 0 0 if (same_tags >= decoding_order-1) {
0 0 if (same_tags >= decoding_order-1) {
2 13 if (same_tags >= decoding_order-1) {
0 0 if (same_tags >= decoding_order-1) {
9932 0 0 if (score <= c.nodes[nodes_next-1].score) continue;
0 0 if (score <= c.nodes[nodes_next-1].score) continue;
1 1 if (score <= c.nodes[nodes_next-1].score) continue;
0 0 if (score <= c.nodes[nodes_next-1].score) continue;
9947 0 0 for (int node = nodes_prev + 1; node < nodes_now; node++)
0 0 for (int node = nodes_prev + 1; node < nodes_now; node++)
1 1 for (int node = nodes_prev + 1; node < nodes_now; node++)
0 0 for (int node = nodes_prev + 1; node < nodes_now; node++)
9948 0 0 if (c.nodes[node].score > c.nodes[best].score)
0 0 if (c.nodes[node].score > c.nodes[best].score)
1 0 if (c.nodes[node].score > c.nodes[best].score)
0 0 if (c.nodes[node].score > c.nodes[best].score)
9951 0 0 for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev)
0 0 for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev)
7 1 for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev)
0 0 for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev)
10000 0 0 maps.resize(MAP_TOTAL);
1 0 maps.resize(MAP_TOTAL);
10004 2 0 vector conllu_elementary_features::descriptions = {
0 2 vector conllu_elementary_features::descriptions = {
68 2 vector conllu_elementary_features::descriptions = {
0 0 vector conllu_elementary_features::descriptions = {
10049 7 1 for (unsigned i = forms.size(); i--;) {
0 0 for (unsigned i = forms.size(); i--;) {
10053 10 7 for (unsigned j = 0; j < analyses[i].size(); j++) {
0 0 for (unsigned j = 0; j < analyses[i].size(); j++) {
10064 3 7 per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] :
0 3 per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] :
0 0 per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] :
0 0 per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] :
10069 0 10 if (index == string::npos) index = tag.size();
0 0 if (index == string::npos) index = tag.size();
10070 0 10 per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0));
0 10 per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0));
0 0 per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0));
0 0 per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0));
10072 10 0 if (index < tag.size()) index++;
0 0 if (index < tag.size()) index++;
10073 10 0 if (index < tag.size()) index = tag.find(separator, index);
0 0 if (index < tag.size()) index = tag.find(separator, index);
10074 10 0 if (index < tag.size()) index++;
0 0 if (index < tag.size()) index++;
10075 40 10 for (size_t length; index < tag.size(); index += length + 1) {
0 0 for (size_t length; index < tag.size(); index += length + 1) {
10077 6 34 length = (length == string::npos ? tag.size() : length) - index;
0 0 length = (length == string::npos ? tag.size() : length) - index;
10079 280 0 for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++)
0 0 for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++)
10080 240 40 if (tag[index + equal_sign] == '=') {
0 0 if (tag[index + equal_sign] == '=') {
10084 2 4 if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE;
0 0 if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE;
10087 2 14 if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER;
0 0 if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER;
10088 6 10 if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER;
0 0 if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER;
10089 4 12 if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON;
0 0 if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON;
10092 5 5 if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE;
0 0 if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE;
10096 19 21 if (value >= 0)
0 0 if (value >= 0)
10102 10 0 if (tag.size() >= 2 && tag[1] == 'V') {
6 4 if (tag.size() >= 2 && tag[1] == 'V') {
4 6 if (tag.size() >= 2 && tag[1] == 'V') {
0 0 if (tag.size() >= 2 && tag[1] == 'V') {
0 0 if (tag.size() >= 2 && tag[1] == 'V') {
0 0 if (tag.size() >= 2 && tag[1] == 'V') {
10104 2 2 verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
2 0 verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
0 0 verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
0 0 verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
10120 5 2 if (analyses[i].size() == 1) {
0 0 if (analyses[i].size() == 1) {
10128 0 2 } else if (forms[i].len <= 0) {
0 0 } else if (forms[i].len <= 0) {
10143 16 2 while (form.len) {
0 0 while (form.len) {
10147 16 0 num = num || cat & unicode::N;
16 0 num = num || cat & unicode::N;
0 0 num = num || cat & unicode::N;
0 0 num = num || cat & unicode::N;
10148 10 6 cap = cap || cat & unicode::Lut;
9 1 cap = cap || cat & unicode::Lut;
0 0 cap = cap || cat & unicode::Lut;
0 0 cap = cap || cat & unicode::Lut;
10149 16 0 dash = dash || cat & unicode::Pd;
16 0 dash = dash || cat & unicode::Pd;
0 0 dash = dash || cat & unicode::Pd;
0 0 dash = dash || cat & unicode::Pd;
10151 16 0 if (index == 10 || (!form.len && index < 10)) {
14 2 if (index == 10 || (!form.len && index < 10)) {
0 2 if (index == 10 || (!form.len && index < 10)) {
0 0 if (index == 10 || (!form.len && index < 10)) {
0 0 if (index == 10 || (!form.len && index < 10)) {
0 0 if (index == 10 || (!form.len && index < 10)) {
10181 12 3 if (prev_dynamic) {
0 0 if (prev_dynamic) {
10189 15 0 if (tag.tag.size() >= 2 && tag.tag[1] == 'V') {
11 4 if (tag.tag.size() >= 2 && tag.tag[1] == 'V') {
4 11 if (tag.tag.size() >= 2 && tag.tag[1] == 'V') {
0 0 if (tag.tag.size() >= 2 && tag.tag[1] == 'V') {
0 0 if (tag.tag.size() >= 2 && tag.tag[1] == 'V') {
0 0 if (tag.tag.size() >= 2 && tag.tag[1] == 'V') {
10243 0 0 maps.resize(MAP_TOTAL);
10279 0 0 for (unsigned i = forms.size(); i--;) {
10283 0 0 for (unsigned j = 0; j < analyses[i].size(); j++) {
10286 0 0 per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty;
10287 0 0 per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty;
10288 0 0 per_tag[i][j].values[TAG25] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG25].value((tag25[0] = analyses[i][j].tag[1], tag25[1] = analyses[i][j].tag[4], tag25), 2) : elementary_feature_empty;
10289 0 0 per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] :
0 0 per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] :
10292 0 0 if (analyses[i][j].tag[0] == 'V') {
10294 0 0 verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
0 0 verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
10304 0 0 if (verb_candidate >= 0) {
10310 0 0 if (analyses[i].size() == 1) {
10314 0 0 } else if (forms[i].len <= 0) {
10325 0 0 while (form.len) {
10329 0 0 num = num || cat & unicode::N;
0 0 num = num || cat & unicode::N;
10330 0 0 cap = cap || cat & unicode::Lut;
0 0 cap = cap || cat & unicode::Lut;
10331 0 0 dash = dash || cat & unicode::Pd;
0 0 dash = dash || cat & unicode::Pd;
10333 0 0 if (index == 5 || (!form.len && index < 5)) {
0 0 if (index == 5 || (!form.len && index < 5)) {
0 0 if (index == 5 || (!form.len && index < 5)) {
10353 0 0 if (prev_dynamic) {
10361 0 0 if (tag.tag[0] == 'V') {
10415 0 0 maps.resize(MAP_TOTAL);
10463 0 0 for (unsigned i = forms.size(); i--;) {
10467 0 0 for (unsigned j = 0; j < analyses[i].size(); j++) {
10469 0 0 per_tag[i][j].values[TAG1] = analyses[i][j].tag.size() >= 1 ? maps[MAP_TAG1].value(analyses[i][j].tag.c_str() + 0, 1) : elementary_feature_empty;
10470 0 0 per_tag[i][j].values[TAG2] = analyses[i][j].tag.size() >= 2 ? maps[MAP_TAG2].value(analyses[i][j].tag.c_str() + 1, 1) : elementary_feature_empty;
10471 0 0 per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty;
10472 0 0 per_tag[i][j].values[TAG4] = analyses[i][j].tag.size() >= 4 ? maps[MAP_TAG4].value(analyses[i][j].tag.c_str() + 3, 1) : elementary_feature_empty;
10473 0 0 per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty;
10474 0 0 per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] :
0 0 per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] :
10477 0 0 if (analyses[i][j].tag[0] == 'V') {
10479 0 0 verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
0 0 verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
10489 0 0 if (verb_candidate >= 0) {
10495 0 0 if (analyses[i].size() == 1) {
10503 0 0 } else if (forms[i].len <= 0) {
10518 0 0 while (form.len) {
10522 0 0 num = num || cat & unicode::N;
0 0 num = num || cat & unicode::N;
10523 0 0 cap = cap || cat & unicode::Lut;
0 0 cap = cap || cat & unicode::Lut;
10524 0 0 dash = dash || cat & unicode::Pd;
0 0 dash = dash || cat & unicode::Pd;
10526 0 0 if (index == 10 || (!form.len && index < 10)) {
0 0 if (index == 10 || (!form.len && index < 10)) {
0 0 if (index == 10 || (!form.len && index < 10)) {
10556 0 0 if (prev_dynamic) {
10564 0 0 if (tag.tag[0] == 'V') {
10615 0 0 cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {}
0 0 cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {}
1 0 cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {}
10629 1 0 if (dict.reset(morpho::load(is)), !dict) return false;
0 0 if (dict.reset(morpho::load(is)), !dict) return false;
0 0 if (dict.reset(morpho::load(is)), !dict) return false;
10631 1 0 if (!features.load(is)) return false;
0 0 if (!features.load(is)) return false;
0 0 if (!features.load(is)) return false;
10643 0 0 if (!dict) return;
0 0 if (!dict) return;
1 0 if (!dict) return;
10646 0 0 if (!c) c = new cache(*this);
0 0 if (!c) c = new cache(*this);
0 0 if (!c) c = new cache(*this);
0 0 if (!c) c = new cache(*this);
1 0 if (!c) c = new cache(*this);
1 0 if (!c) c = new cache(*this);
10649 0 0 if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size());
0 0 if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size());
1 0 if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size());
10650 0 0 for (unsigned i = 0; i < forms.size(); i++) {
0 0 for (unsigned i = 0; i < forms.size(); i++) {
7 1 for (unsigned i = 0; i < forms.size(); i++) {
10653 0 0 dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]);
0 0 dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]);
0 0 dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]);
0 0 dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]);
7 0 dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]);
0 7 dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]);
10656 0 0 if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2);
0 0 if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2);
1 0 if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2);
10659 0 0 for (unsigned i = 0; i < forms.size(); i++)
0 0 for (unsigned i = 0; i < forms.size(); i++)
7 1 for (unsigned i = 0; i < forms.size(); i++)
10670 0 0 if (!c) c = new cache(*this);
0 0 if (!c) c = new cache(*this);
0 0 if (!c) c = new cache(*this);
0 0 if (!c) c = new cache(*this);
0 0 if (!c) c = new cache(*this);
0 0 if (!c) c = new cache(*this);
10696 1 0 tagger_id id = tagger_id(is.get());
1 0 tagger_id id = tagger_id(is.get());
1 0 tagger_id id = tagger_id(is.get());
1 0 tagger_id id = tagger_id(is.get());
0 0 tagger_id id = tagger_id(is.get());
0 0 tagger_id id = tagger_id(is.get());
0 0 tagger_id id = tagger_id(is.get());
10702 0 0 auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id));
10703 0 0 if (res->load(is)) return res.release();
0 0 if (res->load(is)) return res.release();
10711 0 0 auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id));
10712 0 0 if (res->load(is)) return res.release();
0 0 if (res->load(is)) return res.release();
10719 1 0 auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id));
10720 1 0 if (res->load(is)) return res.release();
1 0 if (res->load(is)) return res.release();
10729 0 0 ifstream f(path_from_utf8(fname).c_str(), ifstream::binary);
10730 0 0 if (!f) return nullptr;
10732 0 0 return load(f);
10737 0 0 return morpho ? morpho->new_tokenizer() : nullptr;
10842 0 0 for (int i = 0; i < 15 && pdt_tag[i]; i++)
0 0 for (int i = 0; i < 15 && pdt_tag[i]; i++)
10843 0 0 if (pdt_tag[i] != '-') {
10844 0 0 if (!tag.empty()) tag.push_back('|');
10851 0 0 for (unsigned i = 0; i + 2 < lemma.size(); i++)
10852 0 0 if (lemma[i] == '_' && lemma[i + 1] == ';') {
0 0 if (lemma[i] == '_' && lemma[i + 1] == ';') {
0 0 if (lemma[i] == '_' && lemma[i + 1] == ';') {
10853 0 0 if (!tag.empty()) tag.push_back('|');
10862 0 0 return raw_lemma < lemma.size() ? (lemma.resize(raw_lemma), true) : false;
10873 0 0 for (auto&& tagged_lemma : tagged_lemmas) {
10879 0 0 if (!lemma_changed || tagged_lemmas.size() < 2) return;
0 0 if (!lemma_changed || tagged_lemmas.size() < 2) return;
0 0 if (!lemma_changed || tagged_lemmas.size() < 2) return;
10887 0 0 for (auto&& tagged_lemma_forms : forms) {
10888 0 0 for (auto&& tagged_form : tagged_lemma_forms.forms)
10894 0 0 if (!lemma_changed || forms.size() < 2) return;
0 0 if (!lemma_changed || forms.size() < 2) return;
0 0 if (!lemma_changed || forms.size() < 2) return;
10948 0 0 return lemma_id_len < lemma.size() ? (lemma.resize(lemma_id_len), true) : false;
10958 0 0 for (auto&& tagged_lemma : tagged_lemmas)
10962 0 0 if (!lemma_changed || tagged_lemmas.size() < 2) return;
0 0 if (!lemma_changed || tagged_lemmas.size() < 2) return;
0 0 if (!lemma_changed || tagged_lemmas.size() < 2) return;
10970 0 0 for (auto&& tagged_lemma_forms : forms)
10974 0 0 if (!lemma_changed || forms.size() < 2) return;
0 0 if (!lemma_changed || forms.size() < 2) return;
0 0 if (!lemma_changed || forms.size() < 2) return;
11028 0 0 return raw_lemma_len < lemma.size() ? (lemma.resize(raw_lemma_len), true) : false;
11038 0 0 for (auto&& tagged_lemma : tagged_lemmas)
11042 0 0 if (!lemma_changed || tagged_lemmas.size() < 2) return;
0 0 if (!lemma_changed || tagged_lemmas.size() < 2) return;
0 0 if (!lemma_changed || tagged_lemmas.size() < 2) return;
11050 0 0 for (auto&& tagged_lemma_forms : forms)
11054 0 0 if (!lemma_changed || forms.size() < 2) return;
0 0 if (!lemma_changed || forms.size() < 2) return;
0 0 if (!lemma_changed || forms.size() < 2) return;
11093 0 0 if (name == "pdt_to_conll2009") return tagset_converter::new_pdt_to_conll2009_converter();
11094 0 0 if (name == "strip_lemma_comment") return tagset_converter::new_strip_lemma_comment_converter(dictionary);
11095 0 0 if (name == "strip_lemma_id") return tagset_converter::new_strip_lemma_id_converter(dictionary);
11102 0 0 inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; }
0 0 inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; }
11103 0 0 inline static bool lt(const tagged_lemma& a, const tagged_lemma& b) { int lemma_compare = a.lemma.compare(b.lemma); return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); }
11112 0 0 for (unsigned i = 0; i < forms.size(); i++) {
11114 0 0 for (unsigned j = forms.size() - 1; j > i; j--)
11115 0 0 if (forms[j].lemma == forms[i].lemma) {
11117 0 0 for (auto&& tagged_form : forms[j].forms)
11121 0 0 if (j < forms.size() - 1) {
11129 0 0 if (any_merged && forms[i].forms.size() > 1) {
0 0 if (any_merged && forms[i].forms.size() > 1) {
0 0 if (any_merged && forms[i].forms.size() > 1) {
11132 0 0 inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; }
0 0 inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; }
11133 0 0 inline static bool lt(const tagged_form& a, const tagged_form& b) { int tag_compare = a.tag.compare(b.tag); return tag_compare < 0 || (tag_compare == 0 && a.form < b.form); }
11291 214 2 const unordered_set czech_tokenizer::abbreviations_czech = {
0 0 const unordered_set czech_tokenizer::abbreviations_czech = {
11307 206 2 const unordered_set czech_tokenizer::abbreviations_slovak = {
0 0 const unordered_set czech_tokenizer::abbreviations_slovak = {
11324 0 0 : ragel_tokenizer(version <= 1 ? 1 : 2), m(m) {
0 0 : ragel_tokenizer(version <= 1 ? 1 : 2), m(m) {
11338 0 0 if (!m) return;
11339 0 0 if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return;
0 0 if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return;
0 0 if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return;
11342 0 0 for (unsigned hyphens = 1; hyphens <= 2; hyphens++) {
11344 0 0 if (tokens.size() < 2*hyphens + 1) break;
11346 0 0 if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P ||
0 0 if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P ||
0 0 if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P ||
11347 0 0 tokens[first_hyphen].start + tokens[first_hyphen].length != tokens[first_hyphen + 1].start ||
11348 0 0 tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start ||
0 0 tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start ||
11352 0 0 if (m->analyze(string_piece(chars[tokens[first_hyphen-1].start].str, chars[tokens.back().start + tokens.back().length].str - chars[tokens[first_hyphen-1].start].str), morpho::NO_GUESSER, lemmas) >= 0)
11356 0 0 if (matched_hyphens) {
11370 0 0 while (tokenize_url_email(tokens))
11371 0 0 if (emergency_sentence_split(tokens))
11387 0 0 if ( ( current) == ( (chars.size() - 1)) )
11392 0 0 switch ( _czech_tokenizer_from_state_actions[cs] ) {
11401 0 0 if ( _klen > 0 ) {
11406 0 0 if ( _upper < _lower )
11410 0 0 if ( _widec < _mid[0] )
11412 0 0 else if ( _widec > _mid[1] )
11418 0 0 if (
11419 0 0 !current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256;
0 0 !current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256;
11424 0 0 if (
11425 0 0 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
0 0 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
0 0 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
11438 0 0 if ( _klen > 0 ) {
11443 0 0 if ( _upper < _lower )
11447 0 0 if ( _widec < *_mid )
11449 0 0 else if ( _widec > *_mid )
11461 0 0 if ( _klen > 0 ) {
11466 0 0 if ( _upper < _lower )
11470 0 0 if ( _widec < _mid[0] )
11472 0 0 else if ( _widec > _mid[1] )
11487 0 0 if ( _czech_tokenizer_trans_actions[_trans] == 0 )
11501 0 0 do
11502 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11510 0 0 for (current = ts; current < whitespace; current++)
11513 0 0 if (eos) {( current)++; goto _out; }
11518 0 0 if (!tokens.empty()) {( current)++; goto _out; }
11520 0 0 do
11521 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11530 0 0 do
11531 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11539 0 0 do
11540 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11547 0 0 if (!tokens.empty()) {( current)++; goto _out; }
11549 0 0 do
11550 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11559 0 0 do
11560 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11568 0 0 switch ( _czech_tokenizer_to_state_actions[cs] ) {
11574 0 0 if ( cs == 0 )
11576 0 0 if ( ++( current) != ( (chars.size() - 1)) )
11579 0 0 if ( ( current) == ( (chars.size() - 1)) )
11581 0 0 if ( _czech_tokenizer_eof_trans[cs] > 0 ) {
11641 0 0 return new czech_tokenizer(language, version, m);
11648 0 0 return bool(is) && (language == czech_tokenizer::CZECH || language == czech_tokenizer::SLOVAK);
0 0 return bool(is) && (language == czech_tokenizer::CZECH || language == czech_tokenizer::SLOVAK);
11713 228 2 const unordered_set english_tokenizer::abbreviations = {
0 0 const unordered_set english_tokenizer::abbreviations = {
11812 0 0 if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return;
0 0 if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return;
0 0 if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return;
11827 0 0 if ( ( index) == ( end) )
11836 0 0 if ( _klen > 0 ) {
11841 0 0 if ( _upper < _lower )
11845 0 0 if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < *_mid )
11847 0 0 else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > *_mid )
11859 0 0 if ( _klen > 0 ) {
11864 0 0 if ( _upper < _lower )
11868 0 0 if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < _mid[0] )
11870 0 0 else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > _mid[1] )
11884 0 0 if ( _english_tokenizer_split_token_trans_actions[_trans] == 0 )
11898 0 0 if ( cs == 0 )
11900 0 0 if ( ++( index) != ( end) )
11903 0 0 if ( ( index) == ( end) )
11905 0 0 switch ( _english_tokenizer_split_token_eof_actions[cs] ) {
11915 0 0 if (split_len && split_len < end) {
12069 0 0 english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
0 0 english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
0 0 english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
0 0 english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
12078 0 0 while (tokenize_url_email(tokens))
12079 0 0 if (emergency_sentence_split(tokens))
12095 0 0 if ( ( current) == ( (chars.size() - 1)) )
12100 0 0 switch ( _english_tokenizer_from_state_actions[cs] ) {
12109 0 0 if ( _klen > 0 ) {
12114 0 0 if ( _upper < _lower )
12118 0 0 if ( _widec < _mid[0] )
12120 0 0 else if ( _widec > _mid[1] )
12126 0 0 if (
12127 0 0 !current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256;
0 0 !current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256;
12132 0 0 if (
12133 0 0 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
0 0 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
0 0 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
12146 0 0 if ( _klen > 0 ) {
12151 0 0 if ( _upper < _lower )
12155 0 0 if ( _widec < *_mid )
12157 0 0 else if ( _widec > *_mid )
12169 0 0 if ( _klen > 0 ) {
12174 0 0 if ( _upper < _lower )
12178 0 0 if ( _widec < _mid[0] )
12180 0 0 else if ( _widec > _mid[1] )
12195 0 0 if ( _english_tokenizer_trans_actions[_trans] == 0 )
12209 0 0 do
12210 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12218 0 0 for (current = ts; current < whitespace; current++)
12221 0 0 if (eos) {( current)++; goto _out; }
12226 0 0 if (!tokens.empty()) {( current)++; goto _out; }
12228 0 0 do
12229 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12238 0 0 do
12239 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12247 0 0 do
12248 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12255 0 0 if (!tokens.empty()) {( current)++; goto _out; }
12257 0 0 do
12258 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12267 0 0 do
12268 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12276 0 0 switch ( _english_tokenizer_to_state_actions[cs] ) {
12282 0 0 if ( cs == 0 )
12284 0 0 if ( ++( current) != ( (chars.size() - 1)) )
12287 0 0 if ( ( current) == ( (chars.size() - 1)) )
12289 0 0 if ( _english_tokenizer_eof_trans[cs] > 0 ) {
12446 0 0 generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
0 0 generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
0 0 generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
0 0 generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
12455 0 0 while (tokenize_url_email(tokens))
12456 0 0 if (emergency_sentence_split(tokens))
12472 0 0 if ( ( current) == ( (chars.size() - 1)) )
12477 0 0 switch ( _generic_tokenizer_from_state_actions[cs] ) {
12486 0 0 if ( _klen > 0 ) {
12491 0 0 if ( _upper < _lower )
12495 0 0 if ( _widec < _mid[0] )
12497 0 0 else if ( _widec > _mid[1] )
12503 0 0 if (
12504 0 0 !current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256;
0 0 !current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256;
12509 0 0 if (
12510 0 0 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
0 0 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
0 0 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
12523 0 0 if ( _klen > 0 ) {
12528 0 0 if ( _upper < _lower )
12532 0 0 if ( _widec < *_mid )
12534 0 0 else if ( _widec > *_mid )
12546 0 0 if ( _klen > 0 ) {
12551 0 0 if ( _upper < _lower )
12555 0 0 if ( _widec < _mid[0] )
12557 0 0 else if ( _widec > _mid[1] )
12572 0 0 if ( _generic_tokenizer_trans_actions[_trans] == 0 )
12585 0 0 do
12586 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12594 0 0 for (current = ts; current < whitespace; current++)
12597 0 0 if (eos) {( current)++; goto _out; }
12602 0 0 if (!tokens.empty()) {( current)++; goto _out; }
12604 0 0 do
12605 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12613 0 0 do
12614 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12622 0 0 do
12623 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12630 0 0 if (!tokens.empty()) {( current)++; goto _out; }
12632 0 0 do
12633 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12641 0 0 do
12642 0 0 if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12650 0 0 switch ( _generic_tokenizer_to_state_actions[cs] ) {
12656 0 0 if ( cs == 0 )
12658 0 0 if ( ++( current) != ( (chars.size() - 1)) )
12661 0 0 if ( ( current) == ( (chars.size() - 1)) )
12663 0 0 if ( _generic_tokenizer_eof_trans[cs] > 0 ) {
12726 0 0 version = is.get();
12771 0 0 os.put(version);
12908 2 2 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
0 0 for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
12914 0 0 for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C);
0 0 for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C);
192 12 for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C);
0 0 for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C);
0 0 for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C);
6 2 for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C);
12930 1 0 if (chars.empty()) return;
0 0 if (chars.empty()) return;
0 0 if (chars.empty()) return;
12934 34 1 for (size_t i = 0; i < chars.size(); i++) {
0 0 for (size_t i = 0; i < chars.size(); i++) {
0 0 for (size_t i = 0; i < chars.size(); i++) {
12942 0 0 if (decomposition[0] == 0x3001) decomposition[0] = char32_t(',');
0 0 if (decomposition[0] == 0x3001) decomposition[0] = char32_t(',');
0 0 if (decomposition[0] == 0x3001) decomposition[0] = char32_t(',');
12943 0 0 if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.');
0 0 if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.');
0 0 if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.');
12944 0 0 if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]);
0 0 if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]);
0 0 if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]);
12947 34 0 if (embedding != embeddings.end()) {
0 0 if (embedding != embeddings.end()) {
0 0 if (embedding != embeddings.end()) {
12952 0 0 outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0];
0 0 outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0];
0 0 outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0];
12957 34 1 for (auto&& outcome : outcomes)
0 0 for (auto&& outcome : outcomes)
0 0 for (auto&& outcome : outcomes)
12958 102 34 for (int i = 0; i < 3; i++)
0 0 for (int i = 0; i < 3; i++)
0 0 for (int i = 0; i < 3; i++)
12963 2 1 for (int dir = 0; dir < 2; dir++) {
0 0 for (int dir = 0; dir < 2; dir++) {
0 0 for (int dir = 0; dir < 2; dir++) {
12964 1 1 auto& gru = dir == 0 ? gru_fwd : gru_bwd;
0 0 auto& gru = dir == 0 ? gru_fwd : gru_bwd;
0 0 auto& gru = dir == 0 ? gru_fwd : gru_bwd;
12965 1 1 auto& projection = dir == 0 ? projection_fwd : projection_bwd;
0 0 auto& projection = dir == 0 ? projection_fwd : projection_bwd;
0 0 auto& projection = dir == 0 ? projection_fwd : projection_bwd;
12968 68 2 for (size_t i = 0; i < outcomes.size(); i++) {
0 0 for (size_t i = 0; i < outcomes.size(); i++) {
0 0 for (size_t i = 0; i < outcomes.size(); i++) {
12969 34 34 auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i];
0 0 auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i];
0 0 auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i];
12970 34 34 auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D;
0 0 auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D;
0 0 auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D;
12972 68 1088 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
12975 17408 1088 for (int k = 0; k < D; k++) {
0 0 for (int k = 0; k < D; k++) {
0 0 for (int k = 0; k < D; k++) {
12983 68 1088 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
12985 17408 1088 for (int k = 0; k < D; k++)
0 0 for (int k = 0; k < D; k++)
0 0 for (int k = 0; k < D; k++)
12991 204 68 for (int j = 0; j < 3; j++)
0 0 for (int j = 0; j < 3; j++)
0 0 for (int j = 0; j < 3; j++)
12992 3264 204 for (int k = 0; k < D; k++)
0 0 for (int k = 0; k < D; k++)
0 0 for (int k = 0; k < D; k++)
12998 34 1 for (auto&& outcome : outcomes) {
0 0 for (auto&& outcome : outcomes) {
0 0 for (auto&& outcome : outcomes) {
13000 1 33 if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2;
0 0 if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2;
0 0 if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2;
13008 0 0 for (unsigned chars = data.next_4B(); chars; chars--) {
0 0 for (unsigned chars = data.next_4B(); chars; chars--) {
0 0 for (unsigned chars = data.next_4B(); chars; chars--) {
0 0 for (unsigned chars = data.next_4B(); chars; chars--) {
1 0 for (unsigned chars = data.next_4B(); chars; chars--) {
20 1 for (unsigned chars = data.next_4B(); chars; chars--) {
13009 0 0 auto& embedding = network->embeddings[data.next_4B()];
0 0 auto& embedding = network->embeddings[data.next_4B()];
20 0 auto& embedding = network->embeddings[data.next_4B()];
13010 0 0 copy_n(data.next(D), D, embedding.e.w[0]);
0 0 copy_n(data.next(D), D, embedding.e.w[0]);
20 0 copy_n(data.next(D), D, embedding.e.w[0]);
13014 0 0 network->gru_fwd.load(data);
0 0 network->gru_fwd.load(data);
1 0 network->gru_fwd.load(data);
13015 0 0 network->gru_bwd.load(data);
0 0 network->gru_bwd.load(data);
1 0 network->gru_bwd.load(data);
13016 0 0 network->projection_fwd.load(data);
0 0 network->projection_fwd.load(data);
1 0 network->projection_fwd.load(data);
13017 0 0 network->projection_bwd.load(data);
0 0 network->projection_bwd.load(data);
1 0 network->projection_bwd.load(data);
13020 0 0 for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) {
0 0 for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) {
0 0 for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) {
0 0 for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) {
1 0 for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) {
4 1 for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) {
13021 0 0 unilib::unicode::category_t cat = data.next_4B();
0 0 unilib::unicode::category_t cat = data.next_4B();
4 0 unilib::unicode::category_t cat = data.next_4B();
13022 0 0 network->unknown_chars[cat] = data.next_4B();
0 0 network->unknown_chars[cat] = data.next_4B();
4 0 network->unknown_chars[cat] = data.next_4B();
13032 0 0 for (auto&& embedding : embeddings) {
0 0 for (auto&& embedding : embeddings) {
20 1 for (auto&& embedding : embeddings) {
13036 0 0 for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f);
0 0 for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f);
20 120 for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f);
13037 0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j];
320 20 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j];
5120 320 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j];
13038 0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j];
320 20 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j];
5120 320 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j];
13039 0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j];
320 20 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j];
5120 320 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j];
13040 0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j];
320 20 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j];
5120 320 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j];
13041 0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j];
320 20 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j];
5120 320 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j];
13042 0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j];
0 0 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j];
320 20 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j];
5120 320 for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j];
13044 0 0 for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f);
0 0 for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f);
6 1 for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f);
13067 0 0 : unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {}
0 0 : unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {}
0 0 : unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {}
1 0 : unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {}
13102 29 5 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
29 0 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
29 0 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
0 29 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
4 0 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
4 0 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
4 0 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
0 4 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
28 5 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
28 0 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
28 0 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
0 28 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
7 5 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
7 0 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
7 0 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
0 7 return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t';
13109 1 1 if (current == 0) network_index = network_length = 0;
13112 8 1 for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) {
0 8 for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) {
8 1 for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) {
13113 12 1 while (current < chars.size() - 1 && is_space(current))
7 5 while (current < chars.size() - 1 && is_space(current))
5 8 while (current < chars.size() - 1 && is_space(current))
13114 0 5 if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty())
0 0 if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty())
5 0 if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty())
13117 7 1 if (current >= chars.size() - 1) break;
13120 0 7 if (tokenize_url_email(tokens)) {
13121 0 0 while (network_index < network_length && network_offsets[network_index] < current)
0 0 while (network_index < network_length && network_offsets[network_index] < current)
0 0 while (network_index < network_length && network_offsets[network_index] < current)
13122 0 0 if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty())
0 0 if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty())
0 0 if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty())
13129 22 0 do {
13132 22 7 if (outcome != gru_tokenizer_network::NO_SPLIT) break;
13141 1 33 if (network_index >= network_length) {
13150 34 1 for (size_t offset = current;
13151 34 1 network_offsets.push_back(offset), offset < chars.size() - 1 && network_length < segment;
0 34 network_offsets.push_back(offset), offset < chars.size() - 1 && network_length < segment;
13153 5 29 if (is_space(offset)) {
13155 4 1 while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++;
4 0 while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++;
0 5 while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++;
13161 1 0 if (network_length < segment && network_chars.back().chr != ' ')
1 0 if (network_length < segment && network_chars.back().chr != ' ')
0 1 if (network_length < segment && network_chars.back().chr != ' ')
13169 33 1 for (size_t i = 0; i < network_length - 1; i++)
13170 28 5 if (is_space(network_offsets[i+1])) {
13173 1 4 if (i + 2 == network_length) eos = true;
13174 0 5 for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++)
0 0 for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++)
0 5 for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++)
13175 0 0 eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') ||
0 0 eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') ||
0 0 eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') ||
13176 0 0 (j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n');
0 0 (j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n');
0 0 (j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n');
0 0 (j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n');
13177 1 4 if (eos) network_outcomes[i].outcome = gru_tokenizer_network::END_OF_SENTENCE;
13179 1 4 if (network_outcomes[i].outcome == gru_tokenizer_network::NO_SPLIT)
13181 0 4 if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN)
0 0 if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN)
0 4 if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN)
13186 0 1 if (network_length == segment && network_length >= 10) {
0 0 if (network_length == segment && network_length >= 10) {
13188 0 0 while (network_length > segment / 2)
13189 0 0 if (network_outcomes[--network_length].outcome != gru_tokenizer_network::NO_SPLIT)
13251 1 0 if (!is.get(version)) return false;
13252 1 0 if (!(version >= 1 && version <= 2)) return false;
13255 1 0 if (!compressor::load(is, data)) return false;
1 0 if (!compressor::load(is, data)) return false;
13258 1 0 url_email_tokenizer = data.next_1B();
13259 1 0 segment = data.next_2B();
13260 0 1 allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/;
0 0 allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/;
0 0 allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/;
13262 1 0 network.reset(gru_tokenizer_network::load(data));
13263 1 0 if (!network) return false;
0 0 if (!network) return false;
13289 1 0 if (data.next_1B() != 1) return nullptr;
13352 0 0 class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation {
0 0 class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation {
0 0 class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation {
13370 0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
0 0 matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
13380 0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
0 0 updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
13405 0 0 if (segment < 10) return error.assign("Segment size must be at least 10!"), false;
0 0 if (segment < 10) return error.assign("Segment size must be at least 10!"), false;
0 0 if (segment < 10) return error.assign("Segment size must be at least 10!"), false;
13408 0 0 for (auto&& sentence : data)
0 0 for (auto&& sentence : data)
0 0 for (auto&& sentence : data)
13410 0 0 if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false;
0 0 if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false;
0 0 if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false;
13418 0 0 for (auto&& sentence : data)
0 0 for (auto&& sentence : data)
0 0 for (auto&& sentence : data)
13419 0 0 for (auto&& chr : sentence.sentence)
0 0 for (auto&& chr : sentence.sentence)
0 0 for (auto&& chr : sentence.sentence)
13435 0 0 for (auto&& embedding : this->embeddings)
0 0 for (auto&& embedding : this->embeddings)
0 0 for (auto&& embedding : this->embeddings)
13437 0 0 vector*> chosen_embeddings(segment);
0 0 vector*> chosen_embeddings(segment);
0 0 vector*> chosen_embeddings(segment);
13438 0 0 vector> embedding_dropouts(segment);
0 0 vector> embedding_dropouts(segment);
0 0 vector> embedding_dropouts(segment);
13439 0 0 gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment);
0 0 gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment);
0 0 gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment);
0 0 gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment);
0 0 gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment);
0 0 gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment);
13447 0 0 vector training_input, instance_input(segment);
0 0 vector training_input, instance_input(segment);
0 0 vector training_input, instance_input(segment);
13448 0 0 vector training_output, instance_output(segment);
0 0 vector training_output, instance_output(segment);
0 0 vector training_output, instance_output(segment);
13449 0 0 vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size());
0 0 vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size());
0 0 vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size());
13450 0 0 for (unsigned epoch = 0; epoch < epochs; epoch++) {
0 0 for (unsigned epoch = 0; epoch < epochs; epoch++) {
0 0 for (unsigned epoch = 0; epoch < epochs; epoch++) {
13454 0 0 for (int instance = 0, instances = 10000; instance < instances; instance++) {
0 0 for (int instance = 0, instances = 10000; instance < instances; instance++) {
0 0 for (int instance = 0, instances = 10000; instance < instances; instance++) {
13456 0 0 if (training_offset + segment >= training_input.size()) {
0 0 if (training_offset + segment >= training_input.size()) {
0 0 if (training_offset + segment >= training_input.size()) {
13459 0 0 for (auto&& index : permutation) {
0 0 for (auto&& index : permutation) {
0 0 for (auto&& index : permutation) {
13461 0 0 if (sentence.tokens.empty()) continue;
0 0 if (sentence.tokens.empty()) continue;
0 0 if (sentence.tokens.empty()) continue;
13464 0 0 training_input.resize(training_offset + sentence.sentence.size());
0 0 training_input.resize(training_offset + sentence.sentence.size());
0 0 training_input.resize(training_offset + sentence.sentence.size());
13465 0 0 training_output.resize(training_offset + sentence.sentence.size());
0 0 training_output.resize(training_offset + sentence.sentence.size());
0 0 training_output.resize(training_offset + sentence.sentence.size());
13466 0 0 for (size_t i = 0; i < sentence.sentence.size(); i++) {
0 0 for (size_t i = 0; i < sentence.sentence.size(); i++) {
0 0 for (size_t i = 0; i < sentence.sentence.size(); i++) {
13470 0 0 for (size_t i = 0; i < sentence.tokens.size(); i++)
0 0 for (size_t i = 0; i < sentence.tokens.size(); i++)
0 0 for (size_t i = 0; i < sentence.tokens.size(); i++)
13471 0 0 training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome =
0 0 training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome =
0 0 training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome =
13480 0 0 for (training_shift = segment - 5; training_shift > segment / 2; training_shift--)
0 0 for (training_shift = segment - 5; training_shift > segment / 2; training_shift--)
0 0 for (training_shift = segment - 5; training_shift > segment / 2; training_shift--)
13481 0 0 if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ')
0 0 if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ')
0 0 if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ')
0 0 if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ')
0 0 if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ')
0 0 if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ')
0 0 if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ')
0 0 if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ')
0 0 if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ')
13486 0 0 for (unsigned i = 0; i < segment; i++) {
0 0 for (unsigned i = 0; i < segment; i++) {
0 0 for (unsigned i = 0; i < segment; i++) {
13488 0 0 for (unsigned k = 0; k < D; k++)
0 0 for (unsigned k = 0; k < D; k++)
0 0 for (unsigned k = 0; k < D; k++)
13489 0 0 embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier;
0 0 embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier;
0 0 embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier;
0 0 embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier;
0 0 embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier;
0 0 embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier;
13490 0 0 for (int j = 0; j < 3; j++)
0 0 for (int j = 0; j < 3; j++)
0 0 for (int j = 0; j < 3; j++)
13494 0 0 for (int dir = 0; dir < 2; dir++) {
0 0 for (int dir = 0; dir < 2; dir++) {
0 0 for (int dir = 0; dir < 2; dir++) {
13495 0 0 auto& gru = dir == 0 ? gru_fwd : gru_bwd;
0 0 auto& gru = dir == 0 ? gru_fwd : gru_bwd;
0 0 auto& gru = dir == 0 ? gru_fwd : gru_bwd;
13496 0 0 auto& projection = dir == 0 ? projection_fwd : projection_bwd;
0 0 auto& projection = dir == 0 ? projection_fwd : projection_bwd;
0 0 auto& projection = dir == 0 ? projection_fwd : projection_bwd;
13499 0 0 for (size_t i = 0; i < segment; i++) {
0 0 for (size_t i = 0; i < segment; i++) {
0 0 for (size_t i = 0; i < segment; i++) {
13500 0 0 auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i];
0 0 auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i];
0 0 auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i];
13501 0 0 auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i];
0 0 auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i];
0 0 auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i];
13502 0 0 auto& output = instance_output[dir == 0 ? i : segment - 1 - i];
0 0 auto& output = instance_output[dir == 0 ? i : segment - 1 - i];
0 0 auto& output = instance_output[dir == 0 ? i : segment - 1 - i];
13504 0 0 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
13507 0 0 for (int k = 0; k < D; k++) {
0 0 for (int k = 0; k < D; k++) {
0 0 for (int k = 0; k < D; k++) {
13515 0 0 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
13517 0 0 for (int k = 0; k < D; k++)
0 0 for (int k = 0; k < D; k++)
0 0 for (int k = 0; k < D; k++)
13523 0 0 for (int j = 0; j < D; j++)
0 0 for (int j = 0; j < D; j++)
0 0 for (int j = 0; j < D; j++)
13524 0 0 gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j];
0 0 gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j];
0 0 gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j];
0 0 gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j];
0 0 gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j];
0 0 gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j];
13526 0 0 for (int j = 0; j < 3; j++)
0 0 for (int j = 0; j < 3; j++)
0 0 for (int j = 0; j < 3; j++)
13527 0 0 for (int k = 0; k < D; k++)
0 0 for (int k = 0; k < D; k++)
0 0 for (int k = 0; k < D; k++)
13532 0 0 for (auto&& output : instance_output) {
0 0 for (auto&& output : instance_output) {
0 0 for (auto&& output : instance_output) {
13534 0 0 if (output.w[2] > output.w[best]) best = 2;
0 0 if (output.w[2] > output.w[best]) best = 2;
0 0 if (output.w[2] > output.w[best]) best = 2;
13536 0 0 for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum));
0 0 for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum));
0 0 for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum));
13538 0 0 for (int j = 0; j < 3; j++) output.w[j] *= sum;
0 0 for (int j = 0; j < 3; j++) output.w[j] *= sum;
0 0 for (int j = 0; j < 3; j++) output.w[j] *= sum;
13546 0 0 for (auto&& output : instance_output)
0 0 for (auto&& output : instance_output)
0 0 for (auto&& output : instance_output)
13547 0 0 for (int j = 0; j < 3; j++)
0 0 for (int j = 0; j < 3; j++)
0 0 for (int j = 0; j < 3; j++)
13548 0 0 output.w[j] = (output.outcome == j) - output.w[j];
0 0 output.w[j] = (output.outcome == j) - output.w[j];
0 0 output.w[j] = (output.outcome == j) - output.w[j];
13550 0 0 for (int dir = 0; dir < 2; dir++) {
0 0 for (int dir = 0; dir < 2; dir++) {
0 0 for (int dir = 0; dir < 2; dir++) {
13551 0 0 auto& gru = dir == 0 ? gru_fwd : gru_bwd;
0 0 auto& gru = dir == 0 ? gru_fwd : gru_bwd;
0 0 auto& gru = dir == 0 ? gru_fwd : gru_bwd;
13552 0 0 auto& projection = dir == 0 ? projection_fwd : projection_bwd;
0 0 auto& projection = dir == 0 ? projection_fwd : projection_bwd;
0 0 auto& projection = dir == 0 ? projection_fwd : projection_bwd;
13556 0 0 for (size_t i = segment; i--; ) {
0 0 for (size_t i = segment; i--; ) {
0 0 for (size_t i = segment; i--; ) {
13557 0 0 auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i];
0 0 auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i];
0 0 auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i];
13558 0 0 auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i];
0 0 auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i];
0 0 auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i];
13559 0 0 auto& output = instance_output[dir == 0 ? i : segment - 1 - i];
0 0 auto& output = instance_output[dir == 0 ? i : segment - 1 - i];
0 0 auto& output = instance_output[dir == 0 ? i : segment - 1 - i];
13561 0 0 for (int j = 0; j < D; j++) // These for cycles are swapped because
0 0 for (int j = 0; j < D; j++) // These for cycles are swapped because
0 0 for (int j = 0; j < D; j++) // These for cycles are swapped because
13562 0 0 for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise.
0 0 for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise.
0 0 for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise.
13565 0 0 for (int j = 0; j < D; j++)
0 0 for (int j = 0; j < D; j++)
0 0 for (int j = 0; j < D; j++)
13566 0 0 if (gru.dropouts[i].w[0][j])
0 0 if (gru.dropouts[i].w[0][j])
0 0 if (gru.dropouts[i].w[0][j])
13567 0 0 for (int k = 0; k < 3; k++)
0 0 for (int k = 0; k < 3; k++)
0 0 for (int k = 0; k < 3; k++)
13571 0 0 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
13578 0 0 for (int k = 0; k < D; k++) {
0 0 for (int k = 0; k < D; k++) {
0 0 for (int k = 0; k < D; k++) {
13585 0 0 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
0 0 for (int j = 0; j < D; j++) {
13594 0 0 for (int k = 0; k < D; k++) {
0 0 for (int k = 0; k < D; k++) {
0 0 for (int k = 0; k < D; k++) {
13608 0 0 if (batch_size == 1 ||
0 0 if (batch_size == 1 ||
0 0 if (batch_size == 1 ||
0 0 if (batch_size == 1 ||
0 0 if (batch_size == 1 ||
0 0 if (batch_size == 1 ||
0 0 if (batch_size == 1 ||
0 0 if (batch_size == 1 ||
0 0 if (batch_size == 1 ||
13615 0 0 if (batch_size == 1)
0 0 if (batch_size == 1)
0 0 if (batch_size == 1)
13616 0 0 for (auto&& chosen_embedding : chosen_embeddings)
0 0 for (auto&& chosen_embedding : chosen_embeddings)
0 0 for (auto&& chosen_embedding : chosen_embeddings)
13619 0 0 for (auto&& embedding : embeddings)
0 0 for (auto&& embedding : embeddings)
0 0 for (auto&& embedding : embeddings)
13627 0 0 if (learning_rate_final && learning_rate_final != learning_rate_initial)
0 0 if (learning_rate_final && learning_rate_final != learning_rate_initial)
0 0 if (learning_rate_final && learning_rate_final != learning_rate_initial)
13631 0 0 cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob
0 0 cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob
0 0 cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob
13633 0 0 if (!heldout.empty()) {
0 0 if (!heldout.empty()) {
0 0 if (!heldout.empty()) {
13635 0 0 evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences);
0 0 evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences);
0 0 evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences);
13636 0 0 cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/"
0 0 cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/"
0 0 cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/"
13640 0 0 if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) {
0 0 if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) {
0 0 if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) {
0 0 if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) {
0 0 if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) {
0 0 if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) {
13645 0 0 if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) {
0 0 if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) {
0 0 if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) {
0 0 if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) {
0 0 if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) {
0 0 if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) {
13654 0 0 if (early_stopping && best_combined_f1) {
0 0 if (early_stopping && best_combined_f1) {
0 0 if (early_stopping && best_combined_f1) {
13664 0 0 enc.add_1B(1);
0 0 enc.add_1B(1);
0 0 enc.add_1B(1);
13665 0 0 enc.add_1B(D);
0 0 enc.add_1B(D);
0 0 enc.add_1B(D);
13668 0 0 for (auto&& embedding : this->embeddings) {
0 0 for (auto&& embedding : this->embeddings) {
0 0 for (auto&& embedding : this->embeddings) {
13672 0 0 save_gru(this->gru_fwd, enc);
0 0 save_gru(this->gru_fwd, enc);
0 0 save_gru(this->gru_fwd, enc);
13673 0 0 save_gru(this->gru_bwd, enc);
0 0 save_gru(this->gru_bwd, enc);
0 0 save_gru(this->gru_bwd, enc);
13674 0 0 save_matrix(this->projection_fwd, enc);
0 0 save_matrix(this->projection_fwd, enc);
0 0 save_matrix(this->projection_fwd, enc);
13675 0 0 save_matrix(this->projection_bwd, enc);
0 0 save_matrix(this->projection_bwd, enc);
0 0 save_matrix(this->projection_bwd, enc);
13682 0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
13683 0 0 for (int j = 0; j < C; j++) {
0 0 for (int j = 0; j < C; j++) {
0 0 for (int j = 0; j < C; j++) {
0 0 for (int j = 0; j < C; j++) {
0 0 for (int j = 0; j < C; j++) {
0 0 for (int j = 0; j < C; j++) {
0 0 for (int j = 0; j < C; j++) {
0 0 for (int j = 0; j < C; j++) {
0 0 for (int j = 0; j < C; j++) {
13693 0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
13694 0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
13716 0 0 for (auto&& sentence : heldout) {
0 0 for (auto&& sentence : heldout) {
0 0 for (auto&& sentence : heldout) {
13717 0 0 if (sentence.tokens.empty()) continue;
0 0 if (sentence.tokens.empty()) continue;
0 0 if (sentence.tokens.empty()) continue;
13719 0 0 gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start);
0 0 gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start);
0 0 gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start);
13720 0 0 for (auto&& token : sentence.tokens)
0 0 for (auto&& token : sentence.tokens)
0 0 for (auto&& token : sentence.tokens)
13721 0 0 gold_tokens.emplace_back(text.size() + token.start, token.length);
0 0 gold_tokens.emplace_back(text.size() + token.start, token.length);
0 0 gold_tokens.emplace_back(text.size() + token.start, token.length);
13731 0 0 unilib::utf8::encode(text, text_utf8);
0 0 unilib::utf8::encode(text, text_utf8);
0 0 unilib::utf8::encode(text, text_utf8);
13732 0 0 tokenizer.set_text(text_utf8);
0 0 tokenizer.set_text(text_utf8);
0 0 tokenizer.set_text(text_utf8);
13734 0 0 while (tokenizer.next_sentence(tokens))
0 0 while (tokenizer.next_sentence(tokens))
0 0 while (tokenizer.next_sentence(tokens))
0 0 while (tokenizer.next_sentence(tokens))
0 0 while (tokenizer.next_sentence(tokens))
0 0 while (tokenizer.next_sentence(tokens))
13735 0 0 if (!tokens.empty()) {
0 0 if (!tokens.empty()) {
0 0 if (!tokens.empty()) {
13736 0 0 system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start);
0 0 system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start);
0 0 system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start);
13737 0 0 system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end());
0 0 system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end());
0 0 system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end());
13747 0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
0 0 for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); )
13748 0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
0 0 if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start))
13750 0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
0 0 else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start))
13755 0 0 f1.precision = system.size() ? both / double(system.size()) : 0.;
0 0 f1.precision = system.size() ? both / double(system.size()) : 0.;
0 0 f1.precision = system.size() ? both / double(system.size()) : 0.;
13756 0 0 f1.recall = gold.size() ? both / double(gold.size()) : 0.;
0 0 f1.recall = gold.size() ? both / double(gold.size()) : 0.;
0 0 f1.recall = gold.size() ? both / double(gold.size()) : 0.;
13757 0 0 f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.;
0 0 f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.;
0 0 f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.;
13763 0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
0 0 for (int i = 0; i < R; i++) {
13765 0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
0 0 for (int j = 0; j < C; j++)
13782 0 0 for (int i = 0; i < R; i++)
0 0 for (int i = 0; i < R; i++)
0 0 for (int i = 0; i < R; i++)
0 0 for (int i = 0; i < R; i++)
0 0 for (int i = 0; i < R; i++)
0 0 for (int i = 0; i < R; i++)
13826 0 0 enc.add_1B(url_email_tokenizer);
13827 0 0 enc.add_2B(segment);
13828 0 0 enc.add_1B(allow_spaces);
13831 0 0 if (dimension == 16) {
13833 0 0 if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
0 0 if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
13835 0 0 } else if (dimension == 24) {
13837 0 0 if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
0 0 if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
13839 0 0 } else if (dimension == 64) {
13841 0 0 if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
0 0 if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
13844 0 0 return error.assign("Gru tokenizer dimension '").append(to_string(dimension)).append("' is not supported!"), false;
0 0 return error.assign("Gru tokenizer dimension '").append(to_string(dimension)).append("' is not supported!"), false;
13849 0 0 for (auto&& sentence : data)
13850 0 0 for (auto&& chr : sentence.sentence)
13854 0 0 for (auto&& count : counts) {
13857 0 0 for (auto&& chr : count.second)
13858 0 0 if (chr.second > best)
13860 0 0 if (best_chr)
13863 0 0 enc.add_1B(unknown_chars.size());
13864 0 0 for (auto&& unknown_char : unknown_chars) {
13869 0 0 if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false;
0 0 if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false;
0 0 if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false;
14241 0 0 initialize_ragel_map();
14245 0 1 while (ragel_map_flag.test_and_set()) {}
14246 1 0 if (ragel_map.empty()) {
14247 128 1 for (uint8_t ascii = 0; ascii < 128; ascii++)
14259 1 3 if (chr >= ragel_map.size())
14279 7 0 if ( ( current) == ( (chars.size() - 1)) )
14287 0 30 if ( _klen > 0 ) {
14292 0 0 if ( _upper < _lower )
14296 0 0 if ( _widec < _mid[0] )
14298 0 0 else if ( _widec > _mid[1] )
14304 0 0 if (
14310 0 0 if (
14324 30 0 if ( _klen > 0 ) {
14329 87 30 if ( _upper < _lower )
14333 13 74 if ( _widec < *_mid )
14335 74 0 else if ( _widec > *_mid )
14347 30 0 if ( _klen > 0 ) {
14352 86 7 if ( _upper < _lower )
14356 9 77 if ( _widec < _mid[0] )
14358 54 23 else if ( _widec > _mid[1] )
14372 0 30 if ( _ragel_url_email_trans_actions[_trans] == 0 )
14393 23 7 if ( cs == 0 )
14395 23 0 if ( ++( current) != ( (chars.size() - 1)) )
14401 0 7 if (end > start) {
14430 0 0 vertical_tokenizer() : unicode_tokenizer(0) {}
14528 0 0 if (res->load(is)) return res.release();
14534 1 0 if (res->load(is)) return res.release();
1 0 if (res->load(is)) return res.release();
14540 0 0 if (res->load(is)) return res.release();
0 0 if (res->load(is)) return res.release();
14551 0 0 ifstream f(path_from_utf8(fname).c_str(), ifstream::binary);
14552 0 0 if (!f) return nullptr;
14554 0 0 return load(f);
14575 1 0 ragel_tokenizer::initialize_ragel_map();
14577 1 0 set_text(string_piece(nullptr, 0));
14583 0 2 if (make_copy && text.str) {
0 0 if (make_copy && text.str) {
14590 34 2 for (const char* curr_str = text.str; text.len; curr_str = text.str)
14596 2 0 vector& tokens = tokens_ptr ? *tokens_ptr : tokens_buffer;
14598 2 0 if (forms) forms->clear();
14599 2 0 if (current >= chars.size() - 1) return false;
14602 2 0 if (forms)
14603 7 2 for (auto&& token : tokens)
14610 7 0 if (current >= chars.size() - 1) return false;
14612 7 0 return url_email_tokenizer ? ragel_tokenizer::ragel_url_email(url_email_tokenizer, chars, current, tokens) : false;
14619 0 8 return tokens.size() >= 500 ||
14620 8 0 (tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) ||
0 0 (tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) ||
0 8 (tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) ||
14621 0 0 (tokens.size() >= 400 && chars[tokens.back().start].cat & unicode::Po);
14627 0 0 if (eos_chr == '.' && !tokens.empty()) {
0 0 if (eos_chr == '.' && !tokens.empty()) {
0 0 if (eos_chr == '.' && !tokens.empty()) {
14629 0 0 if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut)
0 0 if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut)
0 0 if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut)
14633 0 0 if (abbreviations) {
14635 0 0 for (size_t i = 0; i < tokens.back().length; i++)
14637 0 0 if (abbreviations->count(eos_buffer))
14662 0 0 if (current >= chars.size() - 1) return false;
14666 0 0 while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++;
0 0 while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++;
0 0 while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++;
0 0 while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++;
14669 0 0 if (current < chars.size() - 1) {
14671 0 0 if (current < chars.size() - 1 &&
0 0 if (current < chars.size() - 1 &&
0 0 if (current < chars.size() - 1 &&
14672 0 0 ((chars[current-1].chr == '\r' && chars[current].chr == '\n') ||
0 0 ((chars[current-1].chr == '\r' && chars[current].chr == '\n') ||
14673 0 0 (chars[current-1].chr == '\n' && chars[current].chr == '\r')))
14677 0 0 if (line_start < line_end)
14765 0 0 return {1, 11, 1, "dev"};
0 0 return {1, 11, 1, "dev"};
14776 0 0 << (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease
0 0 << (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease
14778 0 0 << (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n"
0 0 << (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n"
14780 0 0 "Mathematics and Physics, Charles University in Prague, Czech Republic.";
14803 0 1 assert(t);
14809 1 0 if (!t->nodes.empty()) stack.push_back(0);
14813 7 1 for (size_t i = t->nodes.size(); i > 1; i--)
14818 0 0 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
16 51 return buffer.empty() && stack.size() <= 1;
11 5 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
0 0 return buffer.empty() && stack.size() <= 1;
14881 1116 62 for (auto&& selector : selectors) {
14886 867 63 if (selector.start.second < int(conf.stack.size()))
14890 98 88 if (selector.start.second < int(conf.buffer.size()))
14896 965 151 if (current >= 0)
14897 802 410 for (auto&& direction : selector.directions) {
14901 0 0 current = node.head ? node.head : -1;
14904 120 281 current = direction.second >= 0 && direction.second < int(node.children.size()) ?
14906 127 274 direction.second < 0 && -direction.second <= int(node.children.size()) ?
14908 401 401 -1;
401 281 -1;
14911 247 555 if (current <= 0) break;
14924 1 0 split(description, '\n', lines);
14925 19 1 for (auto&& line : lines) {
14926 18 1 if (!line.len || line.str[0] == '#') continue;
18 0 if (!line.len || line.str[0] == '#') continue;
14929 18 0 split(line, ',', parts);
14932 18 0 split(parts[0], ' ', words);
14933 0 18 if (words.size() != 2)
14934 0 0 return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
0 0 return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
0 0 return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
0 0 return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
0 0 return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
14937 15 3 if (words[0] == "stack")
14939 3 0 else if (words[0] == "buffer")
14942 0 0 return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
0 0 return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
0 0 return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
0 0 return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
0 0 return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
14945 18 0 if (!parse_int(words[1], "starting index", start_index, error)) return false;
18 0 if (!parse_int(words[1], "starting index", start_index, error)) return false;
14947 18 0 selectors.emplace_back(start, start_index);
14950 16 18 for (size_t i = 1; i < parts.size(); i++) {
14951 16 0 split(parts[i], ' ', words);
14952 0 16 if (words.empty())
14953 0 0 return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false;
0 0 return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false;
0 0 return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false;
14955 0 16 if (words[0] == "parent") {
14956 0 0 if (words.size() != 1)
14957 0 0 return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false;
0 0 return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false;
0 0 return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false;
0 0 return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false;
0 0 return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false;
14958 0 0 selectors.back().directions.emplace_back(PARENT, 0);
14959 16 0 } else if (words[0] == "child") {
14960 0 16 if (words.size() != 2)
14961 0 0 return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
0 0 return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
0 0 return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
0 0 return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
0 0 return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
14963 16 0 if (!parse_int(words[1], "child index", child_index, error)) return false;
16 0 if (!parse_int(words[1], "child index", child_index, error)) return false;
14964 16 0 selectors.back().directions.emplace_back(CHILD, child_index);
14966 0 0 return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
0 0 return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
0 0 return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
0 0 return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
0 0 return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
15029 0 0 if (!n.misc.empty()) {
15032 0 0 if (lid != string::npos) {
15037 0 0 if (lid_end == string::npos) lid_end = n.misc.size();
15067 1 3 if (description == "form")
15069 0 3 else if (description == "lemma")
15071 0 3 else if (description == "lemma_id")
15073 0 3 else if (description == "tag")
15075 1 2 else if (description == "universal_tag")
15077 1 1 else if (description == "feats")
15079 0 1 else if (description == "universal_tag_feats")
15081 1 0 else if (description == "deprel")
15150 92 36 if (it != dictionary.end()) return it->second;
15157 18 36 for (auto&& chr : utf8::decoder(word)) {
15158 3 15 (first ? first_category : other_categories) |= unicode::category(chr);
15162 0 36 if ((first_category & unicode::Lut) && (other_categories & unicode::Lut)) {
0 0 if ((first_category & unicode::Lut) && (other_categories & unicode::Lut)) {
15166 0 0 for (auto&& chr : utf8::decoder(word)) {
15167 0 0 utf8::append(buffer, first ? chr : unicode::lowercase(chr));
15172 0 0 if (it != dictionary.end()) return it->second;
15175 36 0 if ((first_category & unicode::Lut) || (other_categories & unicode::Lut)) {
0 36 if ((first_category & unicode::Lut) || (other_categories & unicode::Lut)) {
15179 0 0 if (it != dictionary.end()) return it->second;
15184 0 36 if ((first_category & unicode::N) && !(other_categories & unicode::L)) {
0 0 if ((first_category & unicode::N) && !(other_categories & unicode::L)) {
15189 0 0 if (it != dictionary.end()) return it->second;
15200 0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
15205 0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
31 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
4 27 if (id < 0 || id * dimension >= weights.size()) return nullptr;
27 4 if (id < 0 || id * dimension >= weights.size()) return nullptr;
27 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 27 if (id < 0 || id * dimension >= weights.size()) return nullptr;
27 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
0 0 if (id < 0 || id * dimension >= weights.size()) return nullptr;
15218 4 0 for (unsigned size = data.next_4B(); size; size--) {
23 4 for (unsigned size = data.next_4B(); size; size--) {
15219 23 0 data.next_str(word);
15223 4 0 unknown_index = data.next_1B() ? dictionary.size() : -1;
4 0 unknown_index = data.next_1B() ? dictionary.size() : -1;
15226 4 0 weights.resize(dimension * (dictionary.size() + (unknown_index >= 0)));
15227 4 0 memcpy(weights.data(), data.next(weights.size()), sizeof(float) * weights.size());
15253 0 0 for (auto&& entry : dictionary) {
15254 0 0 assert(entry.second >= 0 && entry.second < int(dictionary.size()));
0 0 assert(entry.second >= 0 && entry.second < int(dictionary.size()));
15258 0 0 for (auto&& word : words)
15259 0 0 enc.add_str(word);
15261 0 0 enc.add_1B(unknown_index >= 0);
15277 0 0 for (auto&& word : words) {
15278 0 0 assert(word.second.size() == dimension);
15283 0 0 if (unknown_weights.empty()) {
15295 0 0 if (dictionary.empty()) return;
15297 0 0 assert(unknown_index < 0 || unknown_index == int(dictionary.size()));
0 0 assert(unknown_index < 0 || unknown_index == int(dictionary.size()));
15300 0 0 for (auto&& entry : dictionary) {
15304 0 0 if (unknown_index >= 0)
15353 0 0 class neural_network {
0 0 class neural_network {
2 1 class neural_network {
1 0 class neural_network {
2 1 class neural_network {
15397 367 2 for (auto&& row : m) {
15411 0 62 assert(!weights[0].empty());
15412 0 62 assert(!weights[1].empty());
15413 1116 62 for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size());
410 706 for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size());
0 410 for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size());
15424 1116 62 for (unsigned sequence = 0; sequence < embedding_ids_sequences.size(); sequence++)
15425 4464 1116 for (unsigned i = 0; i < embeddings.size(); index += embeddings[i].dimension, i++)
15426 1640 2824 if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) {
0 1640 if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) {
1640 2824 if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) {
15428 1640 0 if (cache && i < cache->size() && word < cache->at(i).size()) {
1640 0 if (cache && i < cache->size() && word < cache->at(i).size()) {
0 1640 if (cache && i < cache->size() && word < cache->at(i).size()) {
1640 0 if (cache && i < cache->size() && word < cache->at(i).size()) {
15431 8200 1640 for (unsigned j = 0; j < hidden_layer_size; j++)
15436 0 0 for (unsigned j = 0; j < embeddings[i].dimension; j++)
15437 0 0 for (unsigned k = 0; k < hidden_layer_size; k++)
15441 310 62 for (unsigned i = 0; i < hidden_layer_size; i++) // Bias
15447 62 0 if (!tanh_cache.empty())
15448 310 62 for (auto&& weight : hidden_layer)
15449 310 0 weight = weight <= -10 ? -1 : weight >= 10 ? 1 : tanh_cache[int(weight * 32768 + 10 * 32768)];
310 0 weight = weight <= -10 ? -1 : weight >= 10 ? 1 : tanh_cache[int(weight * 32768 + 10 * 32768)];
15451 0 0 for (auto&& weight : hidden_layer)
15455 0 0 for (auto&& weight : hidden_layer)
15459 0 0 for (auto&& weight : hidden_layer)
15460 0 0 if (weight < 0) weight = 0;
15464 310 62 for (unsigned i = 0; i < hidden_layer_size; i++)
15465 4030 310 for (unsigned j = 0; j < outcomes_size; j++)
15467 806 62 for (unsigned i = 0; i < outcomes_size; i++) // Bias
15471 62 0 if (softmax) {
15473 62 744 for (unsigned i = 1; i < outcomes_size; i++) if (outcomes[i] > max) max = outcomes[i];
68 676 for (unsigned i = 1; i < outcomes_size; i++) if (outcomes[i] > max) max = outcomes[i];
15476 806 62 for (unsigned i = 0; i < outcomes_size; i++) sum += (outcomes[i] = exp(outcomes[i] - max));
15479 806 62 for (unsigned i = 0; i < outcomes_size; i++) outcomes[i] *= sum;
15485 655360 1 for (unsigned i = 0; i < tanh_cache.size(); i++)
15491 4 1 for (auto&& embedding : embeddings) embeddings_dim += embedding.dimension;
15494 0 1 assert(sequences * embeddings_dim + 1 == weights[0].size());
15499 4 1 for (unsigned i = 0, weight_index = 0; i < embeddings.size(); weight_index += embeddings[i].dimension, i++) {
15501 31 0 while (words < max_words && embeddings[i].weight(words)) words++;
4 27 while (words < max_words && embeddings[i].weight(words)) words++;
27 4 while (words < max_words && embeddings[i].weight(words)) words++;
15504 27 4 for (unsigned word = 0; word < words; word++) {
15508 486 27 for (unsigned sequence = 0, index = weight_index; sequence < sequences; index += embeddings_dim, sequence++)
15509 2430 486 for (unsigned j = 0; j < embeddings[i].dimension; j++)
15510 12150 2430 for (unsigned k = 0; k < hidden_layer_size; k++)
15587 0 0 struct workspace {
0 0 struct workspace {
0 0 struct workspace {
0 0 struct workspace {
0 0 struct workspace {
0 0 struct workspace {
15677 0 0 if (parameters.hidden_layer) {
15679 0 0 -parameters.initialization_range * sqrt(6.0 / float(input_size + parameters.hidden_layer));
15683 0 0 for (auto&& row : network.weights[0]) {
15685 0 0 for (auto&& weight : row)
15690 0 0 -parameters.initialization_range * sqrt(6.0 / float(output_size + parameters.hidden_layer));
15694 0 0 for (auto&& row : network.weights[1]) {
15696 0 0 for (auto&& weight : row)
15713 0 0 if (maxnorm_regularization) maxnorm_regularize();
15717 0 0 if (iteration++ >= iterations) return false;
15719 0 0 if (trainer.algorithm != network_trainer::ADADELTA)
15720 0 0 if (trainer.learning_rate != trainer.learning_rate_final && iteration > 1)
0 0 if (trainer.learning_rate != trainer.learning_rate_final && iteration > 1)
15729 0 0 if (dropout_input) {
15732 0 0 for (auto&& flag : w.input_dropout)
15736 0 0 if (dropout_hidden) {
15739 0 0 for (auto&& flag : w.hidden_dropout)
15743 0 0 for (unsigned i = 0; i < network.weights[0].front().size(); i++)
15744 0 0 if (w.hidden_dropout.empty() || !w.hidden_dropout[i])
0 0 if (w.hidden_dropout.empty() || !w.hidden_dropout[i])
0 0 if (w.hidden_dropout.empty() || !w.hidden_dropout[i])
15757 0 0 for (auto&& embedding_ids : embedding_ids_sequences)
15760 0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
15761 0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
15763 0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, embedding++, index++)
15764 0 0 if (w.input_dropout.empty() || !w.input_dropout[index])
0 0 if (w.input_dropout.empty() || !w.input_dropout[index])
0 0 if (w.input_dropout.empty() || !w.input_dropout[index])
15765 0 0 for (auto&& j : w.hidden_kept)
15771 0 0 if (dropout_input) { // Dropout normalization
15773 0 0 for (auto&& i : w.hidden_kept)
15776 0 0 for (auto&& i : w.hidden_kept) // Bias
15782 0 0 for (auto&& weight : w.hidden_layer)
15786 0 0 for (auto&& weight : w.hidden_layer)
15790 0 0 for (auto&& weight : w.hidden_layer)
15791 0 0 if (weight < 0) weight = 0;
15794 0 0 if (dropout_hidden) { // Dropout normalization
15796 0 0 for (auto&& i : w.hidden_kept)
15800 0 0 for (auto&& i : w.hidden_kept)
15801 0 0 for (unsigned j = 0; j < outcomes_size; j++)
15803 0 0 for (unsigned i = 0; i < outcomes_size; i++) // Bias
15808 0 0 for (unsigned i = 1; i < outcomes_size; i++) if (w.outcomes[i] > max) max = w.outcomes[i];
0 0 for (unsigned i = 1; i < outcomes_size; i++) if (w.outcomes[i] > max) max = w.outcomes[i];
15811 0 0 for (unsigned i = 0; i < outcomes_size; i++) sum += (w.outcomes[i] = exp(w.outcomes[i] - max));
15814 0 0 for (unsigned i = 0; i < outcomes_size; i++) w.outcomes[i] *= sum;
15861 0 0 if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size());
0 0 if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size());
0 0 if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size());
0 0 if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size());
0 0 if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size());
15862 0 0 if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size());
0 0 if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size());
0 0 if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size());
0 0 if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size());
0 0 if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size());
15863 0 0 if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size());
0 0 if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size());
0 0 if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size());
0 0 if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size());
0 0 if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size());
15864 0 0 if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size());
0 0 if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size());
0 0 if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size());
0 0 if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size());
0 0 if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size());
15868 0 0 if (TRAINER::need_trainer_data) {
0 0 if (TRAINER::need_trainer_data) {
0 0 if (TRAINER::need_trainer_data) {
0 0 if (TRAINER::need_trainer_data) {
0 0 if (TRAINER::need_trainer_data) {
15869 0 0 while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size());
0 0 while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size());
0 0 while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size());
0 0 while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size());
0 0 while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size());
15870 0 0 while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size);
0 0 while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size);
0 0 while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size);
0 0 while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size);
0 0 while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size);
15875 0 0 for (unsigned i = 0; i < outcomes_size; i++)
0 0 for (unsigned i = 0; i < outcomes_size; i++)
0 0 for (unsigned i = 0; i < outcomes_size; i++)
0 0 for (unsigned i = 0; i < outcomes_size; i++)
0 0 for (unsigned i = 0; i < outcomes_size; i++)
15876 0 0 w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i];
0 0 w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i];
0 0 w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i];
0 0 w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i];
0 0 w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i];
15880 0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
15881 0 0 for (unsigned j = 0; j < outcomes_size; j++)
0 0 for (unsigned j = 0; j < outcomes_size; j++)
0 0 for (unsigned j = 0; j < outcomes_size; j++)
0 0 for (unsigned j = 0; j < outcomes_size; j++)
0 0 for (unsigned j = 0; j < outcomes_size; j++)
15884 0 0 if (dropout_hidden) {
0 0 if (dropout_hidden) {
0 0 if (dropout_hidden) {
0 0 if (dropout_hidden) {
0 0 if (dropout_hidden) {
15886 0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
15893 0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
15897 0 0 for (auto&& i : w.hidden_kept) {
0 0 for (auto&& i : w.hidden_kept) {
0 0 for (auto&& i : w.hidden_kept) {
0 0 for (auto&& i : w.hidden_kept) {
0 0 for (auto&& i : w.hidden_kept) {
15903 0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
15904 0 0 if (w.hidden_layer[i] <= 0)
0 0 if (w.hidden_layer[i] <= 0)
0 0 if (w.hidden_layer[i] <= 0)
0 0 if (w.hidden_layer[i] <= 0)
0 0 if (w.hidden_layer[i] <= 0)
15910 0 0 for (auto&& i : w.hidden_kept) {
0 0 for (auto&& i : w.hidden_kept) {
0 0 for (auto&& i : w.hidden_kept) {
0 0 for (auto&& i : w.hidden_kept) {
0 0 for (auto&& i : w.hidden_kept) {
15911 0 0 if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size);
0 0 if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size);
0 0 if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size);
0 0 if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size);
0 0 if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size);
15912 0 0 for (unsigned j = 0; j < outcomes_size; j++)
0 0 for (unsigned j = 0; j < outcomes_size; j++)
0 0 for (unsigned j = 0; j < outcomes_size; j++)
0 0 for (unsigned j = 0; j < outcomes_size; j++)
0 0 for (unsigned j = 0; j < outcomes_size; j++)
15916 0 0 if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size);
0 0 if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size);
0 0 if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size);
0 0 if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size);
0 0 if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size);
15917 0 0 for (unsigned i = 0; i < outcomes_size; i++)
0 0 for (unsigned i = 0; i < outcomes_size; i++)
0 0 for (unsigned i = 0; i < outcomes_size; i++)
0 0 for (unsigned i = 0; i < outcomes_size; i++)
0 0 for (unsigned i = 0; i < outcomes_size; i++)
15921 0 0 if (dropout_input) {
0 0 if (dropout_input) {
0 0 if (dropout_input) {
0 0 if (dropout_input) {
0 0 if (dropout_input) {
15923 0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
15928 0 0 for (auto&& embedding_ids : embedding_ids_sequences)
0 0 for (auto&& embedding_ids : embedding_ids_sequences)
0 0 for (auto&& embedding_ids : embedding_ids_sequences)
0 0 for (auto&& embedding_ids : embedding_ids_sequences)
0 0 for (auto&& embedding_ids : embedding_ids_sequences)
15931 0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
15932 0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
0 0 if (embedding_ids && (*embedding_ids)[i] >= 0) {
15936 0 0 if (embeddings[i].can_update_weights(embedding_id)) {
0 0 if (embeddings[i].can_update_weights(embedding_id)) {
0 0 if (embeddings[i].can_update_weights(embedding_id)) {
0 0 if (embeddings[i].can_update_weights(embedding_id)) {
0 0 if (embeddings[i].can_update_weights(embedding_id)) {
15937 0 0 if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
0 0 if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
0 0 if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
0 0 if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
0 0 if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
0 0 if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
0 0 if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
0 0 if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
0 0 if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
0 0 if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
15938 0 0 if (w.error_embedding[i][embedding_id].empty()) {
0 0 if (w.error_embedding[i][embedding_id].empty()) {
0 0 if (w.error_embedding[i][embedding_id].empty()) {
0 0 if (w.error_embedding[i][embedding_id].empty()) {
0 0 if (w.error_embedding[i][embedding_id].empty()) {
15940 0 0 w.error_embedding_nonempty[i].emplace_back(embedding_id);
0 0 w.error_embedding_nonempty[i].emplace_back(embedding_id);
0 0 w.error_embedding_nonempty[i].emplace_back(embedding_id);
0 0 w.error_embedding_nonempty[i].emplace_back(embedding_id);
0 0 w.error_embedding_nonempty[i].emplace_back(embedding_id);
15946 0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
0 0 for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
15947 0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
0 0 if (w.input_dropout.empty() || !w.input_dropout[index]) {
15948 0 0 if (error_embedding)
0 0 if (error_embedding)
0 0 if (error_embedding)
0 0 if (error_embedding)
0 0 if (error_embedding)
15949 0 0 for (auto&& j : w.hidden_kept)
0 0 for (auto&& j : w.hidden_kept)
0 0 for (auto&& j : w.hidden_kept)
0 0 for (auto&& j : w.hidden_kept)
0 0 for (auto&& j : w.hidden_kept)
15951 0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
15952 0 0 for (auto&& j : w.hidden_kept)
0 0 for (auto&& j : w.hidden_kept)
0 0 for (auto&& j : w.hidden_kept)
0 0 for (auto&& j : w.hidden_kept)
0 0 for (auto&& j : w.hidden_kept)
15962 0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
0 0 if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
15963 0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
0 0 for (auto&& i : w.hidden_kept)
15968 0 0 if (++w.batch < batch_size) return;
0 0 if (++w.batch < batch_size) return;
0 0 if (++w.batch < batch_size) return;
0 0 if (++w.batch < batch_size) return;
0 0 if (++w.batch < batch_size) return;
15972 0 0 if (!network.weights[0].empty())
0 0 if (!network.weights[0].empty())
0 0 if (!network.weights[0].empty())
0 0 if (!network.weights[0].empty())
0 0 if (!network.weights[0].empty())
15973 0 0 for (int i = 0; i < 2; i++) {
0 0 for (int i = 0; i < 2; i++) {
0 0 for (int i = 0; i < 2; i++) {
0 0 for (int i = 0; i < 2; i++) {
0 0 for (int i = 0; i < 2; i++) {
15974 0 0 for (unsigned j = 0; j < w.weights_batch[i].size(); j++)
0 0 for (unsigned j = 0; j < w.weights_batch[i].size(); j++)
0 0 for (unsigned j = 0; j < w.weights_batch[i].size(); j++)
0 0 for (unsigned j = 0; j < w.weights_batch[i].size(); j++)
0 0 for (unsigned j = 0; j < w.weights_batch[i].size(); j++)
15975 0 0 if (!w.weights_batch[i][j].empty()) {
0 0 if (!w.weights_batch[i][j].empty()) {
0 0 if (!w.weights_batch[i][j].empty()) {
0 0 if (!w.weights_batch[i][j].empty()) {
0 0 if (!w.weights_batch[i][j].empty()) {
15976 0 0 for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++)
0 0 for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++)
0 0 for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++)
0 0 for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++)
0 0 for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++)
15977 0 0 network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
0 0 network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
0 0 network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
0 0 network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
0 0 network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
0 0 network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
0 0 network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
0 0 network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
0 0 network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
15983 0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
0 0 for (unsigned i = 0; i < embeddings.size(); i++) {
15984 0 0 for (auto&& id : w.error_embedding_nonempty[i]) {
0 0 for (auto&& id : w.error_embedding_nonempty[i]) {
0 0 for (auto&& id : w.error_embedding_nonempty[i]) {
0 0 for (auto&& id : w.error_embedding_nonempty[i]) {
0 0 for (auto&& id : w.error_embedding_nonempty[i]) {
15985 0 0 if (TRAINER::need_trainer_data) {
0 0 if (TRAINER::need_trainer_data) {
0 0 if (TRAINER::need_trainer_data) {
0 0 if (TRAINER::need_trainer_data) {
0 0 if (TRAINER::need_trainer_data) {
15986 0 0 if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1);
0 0 if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1);
0 0 if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1);
0 0 if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1);
0 0 if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1);
15987 0 0 if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1);
0 0 if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1);
0 0 if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1);
0 0 if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1);
0 0 if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1);
15988 0 0 if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension);
0 0 if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension);
0 0 if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension);
0 0 if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension);
0 0 if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension);
15991 0 0 for (unsigned j = 0; j < embeddings[i].dimension; j++)
0 0 for (unsigned j = 0; j < embeddings[i].dimension; j++)
0 0 for (unsigned j = 0; j < embeddings[i].dimension; j++)
0 0 for (unsigned j = 0; j < embeddings[i].dimension; j++)
0 0 for (unsigned j = 0; j < embeddings[i].dimension; j++)
15992 0 0 embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j];
0 0 embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j];
0 0 embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j];
0 0 embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j];
15999 0 0 if (maxnorm_regularization) maxnorm_regularize();
0 0 if (maxnorm_regularization) maxnorm_regularize();
0 0 if (maxnorm_regularization) maxnorm_regularize();
0 0 if (maxnorm_regularization) maxnorm_regularize();
0 0 if (maxnorm_regularization) maxnorm_regularize();
16026 0 0 training_failure("Internal error, unsupported trainer!");
0 0 training_failure("Internal error, unsupported trainer!");
16030 0 0 if (!l1_regularization) return;
16032 0 0 for (auto&& weights : network.weights)
16033 0 0 for (unsigned i = 0; i + 1 /*ignore biases*/ < weights.size(); i++) {
16035 0 0 for (auto&& weight : row)
16036 0 0 if (weight < l1_regularization) weight += l1_regularization;
16037 0 0 else if (weight > l1_regularization) weight -= l1_regularization;
16043 0 0 if (!maxnorm_regularization) return;
16045 0 0 for (unsigned i = 0; i < 2; i++)
16046 0 0 for (unsigned j = 0; j < network.weights[i].front().size(); j++) {
16048 0 0 for (auto&& row : network.weights[i])
16051 0 0 if (length > 0 && length > maxnorm_regularization * maxnorm_regularization) {
0 0 if (length > 0 && length > maxnorm_regularization * maxnorm_regularization) {
16053 0 0 for (auto&& row : network.weights[i])
16060 0 0 if (l1_regularization) l1_regularize();
0 0 if (l1_regularization) l1_regularize();
0 0 if (l1_regularization) l1_regularize();
16065 0 0 enc.add_4B(m.empty() ? 0 : m.front().size());
16067 0 0 for (auto&& row : m) {
16068 0 0 assert(row.size() == m.front().size());
16283 1 0 struct workspace {
2 1 struct workspace {
16284 2 1 workspace(bool single_root) : conf(single_root) {}
16342 0 0 ifstream in(path_from_utf8(file).c_str(), ifstream::in | ifstream::binary);
16343 0 0 if (!in.is_open()) return nullptr;
16344 0 0 return load(in, cache);
16351 1 0 if (!compressor::load(in, data)) return nullptr;
1 0 if (!compressor::load(in, data)) return nullptr;
16355 1 0 data.next_str(name);
16357 1 0 result.reset(create(name));
16358 0 1 if (!result) return nullptr;
16360 1 0 result->load(data, cache);
0 0 result->load(data, cache);
16365 1 0 return result && data.is_end() ? result.release() : nullptr;
1 0 return result && data.is_end() ? result.release() : nullptr;
16369 1 0 if (name == "nn") return new parser_nn(false);
16370 0 0 if (name == "nn_versioned") return new parser_nn(true);
16398 1 0 if (beam_size > 1)
16401 0 0 parse_greedy(t, cost);
16405 0 0 assert(system);
16406 0 0 if (cost) *cost = 0.;
16410 0 0 if (!w) w = new workspace(single_root);
16416 0 0 if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size());
16417 0 0 for (size_t i = 0; i < t.nodes.size(); i++) {
16418 0 0 if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size());
16419 0 0 for (size_t j = 0; j < embeddings.size(); j++) {
16427 0 0 for (; !w->conf.final(); transitions++) {
16431 0 0 for (size_t i = 0; i < w->extracted_nodes.size(); i++)
16432 0 0 w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr;
16439 0 0 for (unsigned i = 0; i < w->outcomes.size(); i++)
16440 0 0 if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best]))
0 0 if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best]))
0 0 if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best]))
0 0 if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best]))
16445 0 0 if (cost) *cost += log(w->outcomes[best]);
16448 0 0 if (child >= 0)
16449 0 0 for (size_t i = 0; i < embeddings.size(); i++) {
16455 0 0 if (cost && transitions)
16463 0 1 assert(system);
16467 1 0 if (!w) w = new workspace(single_root);
16470 2 1 for (int i = 0; i < 2; i++) {
16471 2 10 while (w->bs_confs[i].size() < beam_size) w->bs_confs[i].emplace_back(single_root);
16472 0 2 while (w->bs_confs[i].size() > beam_size) w->bs_confs[i].pop_back();
16481 1 0 if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size());
16482 1 0 if (w->embeddings_values.size() < t.nodes.size()) w->embeddings_values.resize(t.nodes.size());
16483 8 1 for (size_t i = 0; i < t.nodes.size(); i++) {
16484 8 0 if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size());
16485 8 0 if (w->embeddings_values[i].size() < embeddings.size()) w->embeddings_values[i].resize(embeddings.size());
16486 32 8 for (size_t j = 0; j < embeddings.size(); j++) {
16494 1 15 for (bool all_final = false; !all_final; iteration++) {
16498 67 15 for (size_t c = 0; c < w->bs_confs_size[iteration & 1]; c++) {
16501 5 62 if (bs_conf.conf.final()) {
16502 0 5 if (w->bs_alternatives.size() == beam_size) {
16503 0 0 if (bs_conf.cost <= w->bs_alternatives[0].cost) continue;
16515 496 62 for (size_t i = 0; i < t.nodes.size(); i++)
16516 1984 496 for (size_t j = 0; j < embeddings.size(); j++) {
16518 96 1888 if (w->word != w->embeddings_values[i][j]) {
16527 1116 62 for (size_t i = 0; i < w->extracted_nodes.size(); i++)
16528 410 706 w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr;
16534 806 62 for (unsigned i = 0; i < w->outcomes.size(); i++)
16535 633 173 if (system->applicable(bs_conf.conf, i)) {
16537 567 66 if (w->bs_alternatives.size() == beam_size) {
16538 170 397 if (cost <= w->bs_alternatives[0].cost) continue;
16548 15 71 for (auto&& alternative : w->bs_alternatives) {
16552 66 5 if (alternative.transition >= 0) {
16562 4 1 for (size_t i = 1; i < w->bs_confs_size[iteration & 1]; i++)
16563 2 2 if (w->bs_confs[iteration & 1][i].cost > w->bs_confs[iteration & 1][best].cost)
16567 0 1 if (cost) *cost = w->bs_confs[iteration & 1][best].cost * (t.nodes.size() - 1);
16574 129 1032 for (auto&& node : conf.t->nodes) node.children.clear();
16575 1032 129 for (size_t i = 0; i < conf.t->nodes.size(); i++) {
16578 302 730 if (heads[i] >= 0) conf.t->nodes[heads[i]].children.push_back(i);
16583 1 66 if (conf.t->nodes.size() > heads.size()) heads.resize(conf.t->nodes.size());
16584 1 66 if (conf.t->nodes.size() > deprels.size()) deprels.resize(conf.t->nodes.size());
16585 536 67 for (size_t i = 0; i < conf.t->nodes.size(); i++) {
16594 0 1 version = versioned ? data.next_1B() : 1;
0 0 version = versioned ? data.next_1B() : 1;
16595 0 1 if (!(version >= 1 && version <= VERSION_LATEST))
16598 0 1 single_root = version >= 2 ? data.next_1B() : false;
0 0 single_root = version >= 2 ? data.next_1B() : false;
0 0 single_root = version >= 2 ? data.next_1B() : false;
16601 1 0 labels.resize(data.next_2B());
1 0 labels.resize(data.next_2B());
16602 6 1 for (auto&& label : labels)
16603 6 0 data.next_str(label);
16607 1 0 data.next_str(system_name);
16608 1 0 system.reset(transition_system::create(system_name, labels));
16609 0 1 if (!system) throw binary_decoder_error("Cannot load transition system");
16612 1 0 data.next_str(description);
16613 1 0 if (!nodes.create(description, error))
0 1 if (!nodes.create(description, error))
16617 1 0 values.resize(data.next_2B());
1 0 values.resize(data.next_2B());
16618 4 1 for (auto&& value : values) {
16619 4 0 data.next_str(description);
16620 4 0 if (!value.create(description, error))
0 4 if (!value.create(description, error))
16624 1 0 embeddings.resize(values.size());
16625 4 1 for (auto&& embedding : embeddings)
16626 4 0 embedding.load(data);
16629 1 0 network.load(data);
16630 1 0 network.generate_tanh_cache();
16631 1 0 network.generate_embeddings_cache(embeddings, embeddings_cache, cache);
16678 0 0 if (train.empty()) training_failure("No training data was given!");
0 0 if (train.empty()) training_failure("No training data was given!");
0 0 if (train.empty()) training_failure("No training data was given!");
16684 0 0 for (auto&& tree : train)
16685 0 0 for (auto&& node : tree.nodes)
16686 0 0 if (node.id) {
16687 0 0 if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!");
0 0 if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!");
0 0 if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!");
0 0 if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!");
16688 0 0 if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!");
0 0 if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!");
0 0 if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!");
0 0 if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!");
16696 0 0 for (auto&& tree : train)
16697 0 0 for (auto&& node : tree.nodes)
16698 0 0 if (node.id && !labels_set.count(node.deprel)) {
16700 0 0 parser.labels.push_back(node.deprel);
16704 0 0 if (single_root) {
16705 0 0 for (auto&& tree : train) {
16707 0 0 for (auto&& node : tree.nodes)
16708 0 0 if (node.id) {
16709 0 0 if (node.head == 0 && node.deprel != "root")
0 0 if (node.head == 0 && node.deprel != "root")
0 0 if (node.head == 0 && node.deprel != "root")
16710 0 0 training_failure("When single root is required, every root node must have 'root' deprel!");
0 0 training_failure("When single root is required, every root node must have 'root' deprel!");
16711 0 0 if (node.head != 0 && node.deprel == "root")
0 0 if (node.head != 0 && node.deprel == "root")
0 0 if (node.head != 0 && node.deprel == "root")
16712 0 0 training_failure("When single root is required, any non-root cannot have 'root' deprel!");
0 0 training_failure("When single root is required, any non-root cannot have 'root' deprel!");
16715 0 0 if (roots != 1)
16716 0 0 training_failure("When single root is required, every training tree must have single root!");
0 0 training_failure("When single root is required, every training tree must have single root!");
16720 0 0 if (!labels_set.count("root"))
0 0 if (!labels_set.count("root"))
16721 0 0 training_failure("When single root is required, the deprel 'root' must be present!");
0 0 training_failure("When single root is required, the deprel 'root' must be present!");
16722 0 0 if (labels_set.size() <= 1)
16723 0 0 training_failure("When single root is required, deprel different from 'root' must exist!");
0 0 training_failure("When single root is required, deprel different from 'root' must exist!");
16727 0 0 parser.system.reset(transition_system::create(transition_system_name, parser.labels));
16728 0 0 if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!");
0 0 if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!");
0 0 if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!");
16730 0 0 unique_ptr oracle(parser.system->oracle(transition_oracle_name));
16731 0 0 if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!");
0 0 if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!");
0 0 if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!");
16735 0 0 if (!parser.nodes.create(nodes_description, error)) training_failure(error);
0 0 if (!parser.nodes.create(nodes_description, error)) training_failure(error);
0 0 if (!parser.nodes.create(nodes_description, error)) training_failure(error);
16740 0 0 split(embeddings_description, '\n', lines);
16741 0 0 for (auto&& line : lines) {
16743 0 0 if (!line.len || line.str[0] == '#') continue;
0 0 if (!line.len || line.str[0] == '#') continue;
16745 0 0 split(line, ' ', tokens);
16746 0 0 if (!(tokens.size() >= 3 && tokens.size() <= 6))
0 0 if (!(tokens.size() >= 3 && tokens.size() <= 6))
0 0 if (!(tokens.size() >= 3 && tokens.size() <= 6))
16747 0 0 training_failure("Expected 3 to 6 columns on embedding description line '" << line << "'!");
0 0 training_failure("Expected 3 to 6 columns on embedding description line '" << line << "'!");
16749 0 0 value_names.emplace_back(string(tokens[0].str, tokens[0].len));
16750 0 0 parser.values.emplace_back();
16751 0 0 if (!parser.values.back().create(tokens[0], error)) training_failure(error);
0 0 if (!parser.values.back().create(tokens[0], error)) training_failure(error);
0 0 if (!parser.values.back().create(tokens[0], error)) training_failure(error);
16753 0 0 int dimension = parse_int(tokens[1], "embedding dimension");
16754 0 0 int min_count = parse_int(tokens[2], "minimum frequency count");
16764 0 0 for (auto&& tree : train)
16765 0 0 for (auto&& node : tree.nodes)
16766 0 0 if (node.id) {
16767 0 0 parser.values.back().extract(node, word);
16772 0 0 if (tokens.size() >= 4) {
16773 0 0 int update_weights = tokens.size() >= 5 ? parse_int(tokens[4], "update weights") : 1;
0 0 int update_weights = tokens.size() >= 5 ? parse_int(tokens[4], "update weights") : 1;
16774 0 0 int max_embeddings = tokens.size() >= 6 ? parse_int(tokens[5], "maximum embeddings count") : numeric_limits::max();
0 0 int max_embeddings = tokens.size() >= 6 ? parse_int(tokens[5], "maximum embeddings count") : numeric_limits::max();
16775 0 0 ifstream in(path_from_utf8(string(tokens[3].str, tokens[3].len)).c_str());
16776 0 0 if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!");
0 0 if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!");
0 0 if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!");
16781 0 0 if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!");
0 0 if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!");
0 0 if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!");
0 0 if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!");
16782 0 0 split(line, ' ', parts);
16783 0 0 if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!");
0 0 if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!");
0 0 if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!");
16784 0 0 int file_dimension = parse_int(parts[1], "embedding file dimension");
16786 0 0 if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!");
0 0 if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!");
0 0 if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!");
16790 0 0 if (file_dimension > dimension) {
16791 0 0 embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]";
0 0 embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]";
0 0 embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]";
0 0 embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]";
16794 0 0 projection.resize(dimension);
16795 0 0 for (auto&& row : projection) {
16796 0 0 row.resize(file_dimension);
16797 0 0 for (auto&& weight : row) weight = uniform(generator);
16800 0 0 for (auto&& weight : row) sum += weight;
16801 0 0 for (auto&& weight : row) weight /= sum;
16806 0 0 vector input_weights(file_dimension);
16807 0 0 vector projected_weights(dimension);
16808 0 0 while (getline(in, line) && int(weights.size()) < max_embeddings) {
0 0 while (getline(in, line) && int(weights.size()) < max_embeddings) {
0 0 while (getline(in, line) && int(weights.size()) < max_embeddings) {
0 0 while (getline(in, line) && int(weights.size()) < max_embeddings) {
16809 0 0 split(line, ' ', parts);
16810 0 0 if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line
0 0 if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line
0 0 if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line
16811 0 0 if (int(parts.size()) != file_dimension + 1) training_failure("Wrong number of values on line '" << line << "' of embedding file '" << tokens[3]);
0 0 if (int(parts.size()) != file_dimension + 1) training_failure("Wrong number of values on line '" << line << "' of embedding file '" << tokens[3]);
16812 0 0 for (int i = 0; i < file_dimension; i++)
16813 0 0 input_weights[i] = parse_double(parts[1 + i], "embedding weight");
16818 0 0 if (update_weights == 2 && !word_counts.count(word))
16821 0 0 for (int i = 0; i < dimension; i++)
16822 0 0 if (file_dimension == dimension) {
16826 0 0 for (int j = 0; j < file_dimension; j++)
16830 0 0 if (!weights_set.count(word)) {
16831 0 0 weights.emplace_back(word, projected_weights);
16836 0 0 updatable_index = update_weights ? 0 : embeddings_from_file;
16842 0 0 for (auto&& word_count : word_counts)
16843 0 0 if (word_count.second >= min_count && !weights_set.count(word_count.first))
16844 0 0 count_words.emplace_back(word_count.second, word_count.first);
16848 0 0 vector word_weights(dimension);
16850 0 0 for (auto&& count_word : count_words) {
16851 0 0 for (auto&& word_weight : word_weights)
16854 0 0 weights.emplace_back(count_word.second, word_weights);
16859 0 0 vector unknown_weights(dimension);
16860 0 0 if (min_count > 1) {
16863 0 0 for (auto&& weight : unknown_weights)
16868 0 0 parser.embeddings.emplace_back();
16869 0 0 parser.embeddings.back().create(dimension, updatable_index, weights, unknown_weights);
16874 0 0 for (auto&& tree : train)
16875 0 0 for (auto&& node : tree.nodes)
16876 0 0 if (node.id) {
16877 0 0 parser.values.back().extract(node, word);
16879 0 0 int word_id = parser.embeddings.back().lookup_word(word, buffer);
16881 0 0 words_covered_from_file += word_id != parser.embeddings.back().unknown_word() && unsigned(word_id) < embeddings_from_file;
0 0 words_covered_from_file += word_id != parser.embeddings.back().unknown_word() && unsigned(word_id) < embeddings_from_file;
16891 0 0 for (auto&& embedding : parser.embeddings) total_dimension += embedding.dimension;
16892 0 0 for (auto&& tree : train) total_nodes += tree.nodes.size() - 1;
16896 0 0 neural_network_trainer network_trainer(parser.network, total_dimension * parser.nodes.node_count(), parser.system->transition_count(), scaled_parameters, generator);
0 0 neural_network_trainer network_trainer(parser.network, total_dimension * parser.nodes.node_count(), parser.system->transition_count(), scaled_parameters, generator);
16902 0 0 for (size_t i = 0; i < train.size(); i++)
16905 0 0 for (int iteration = 1; network_trainer.next_iteration(); iteration++) {
16922 0 0 tree t_eval;
16930 0 0 for (unsigned current_index; (current_index = atomic_index++) < permutation.size();) {
16934 0 0 conf.init(&t);
16937 0 0 if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size());
0 0 if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size());
16938 0 0 for (size_t i = 0; i < t.nodes.size(); i++) {
16939 0 0 nodes_embeddings[i].resize(parser.embeddings.size());
16940 0 0 for (size_t j = 0; j < parser.embeddings.size(); j++) {
16941 0 0 parser.values[j].extract(t.nodes[i], word);
16942 0 0 nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer);
16947 0 0 auto tree_oracle = oracle->create_tree_oracle(gold);
16950 0 0 while (!conf.final()) {
16952 0 0 parser.nodes.extract(conf, extracted_nodes);
16953 0 0 extracted_embeddings.resize(extracted_nodes.size());
16954 0 0 for (size_t i = 0; i < extracted_nodes.size(); i++)
16955 0 0 extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr;
16958 0 0 network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace);
16962 0 0 for (unsigned i = 0; i < workspace.outcomes.size(); i++)
16963 0 0 if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best]))
0 0 if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best]))
0 0 if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best]))
0 0 if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best]))
0 0 if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best]))
16967 0 0 auto prediction = tree_oracle->predict(conf, network_best, iteration);
16970 0 0 if (parser.system->applicable(conf, prediction.best)) {
0 0 if (parser.system->applicable(conf, prediction.best)) {
16972 0 0 if (workspace.outcomes[prediction.best])
16976 0 0 network_trainer.backpropagate(parser.embeddings, extracted_embeddings, prediction.best, workspace);
16980 0 0 if (!parser.system->applicable(conf, prediction.to_follow))
0 0 if (!parser.system->applicable(conf, prediction.to_follow))
16984 0 0 int child = parser.system->perform(conf, prediction.to_follow);
16987 0 0 if (child >= 0)
16988 0 0 for (size_t i = 0; i < parser.embeddings.size(); i++) {
16989 0 0 parser.values[i].extract(t.nodes[child], word);
16990 0 0 nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
16996 0 0 if (parameters.structured_interval && (current_index % parameters.structured_interval) == 0) {
0 0 if (parameters.structured_interval && (current_index % parameters.structured_interval) == 0) {
17001 0 0 conf.init(&t);
17004 0 0 if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size());
0 0 if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size());
17005 0 0 for (size_t i = 0; i < t.nodes.size(); i++) {
17006 0 0 nodes_embeddings[i].resize(parser.embeddings.size());
17007 0 0 for (size_t j = 0; j < parser.embeddings.size(); j++) {
17008 0 0 parser.values[j].extract(t.nodes[i], word);
17009 0 0 nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer);
17014 0 0 auto tree_oracle = oracle->create_tree_oracle(gold);
17017 0 0 while (!conf.final()) {
17019 0 0 parser.nodes.extract(conf, extracted_nodes);
17020 0 0 extracted_embeddings.resize(extracted_nodes.size());
17021 0 0 for (size_t i = 0; i < extracted_nodes.size(); i++)
17022 0 0 extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr;
17027 0 0 tree_oracle->interesting_transitions(conf, transitions_eval);
17028 0 0 for (auto&& transition : transitions_eval) {
17030 0 0 conf_eval = conf;
17032 0 0 nodes_embeddings_eval = nodes_embeddings;
17035 0 0 int child = parser.system->perform(conf_eval, transition);
17036 0 0 if (child >= 0)
17037 0 0 for (size_t i = 0; i < parser.embeddings.size(); i++) {
17038 0 0 parser.values[i].extract(t_eval.nodes[child], word);
17039 0 0 nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
17043 0 0 while (!conf_eval.final()) {
17045 0 0 parser.nodes.extract(conf_eval, extracted_nodes_eval);
17046 0 0 extracted_embeddings_eval.resize(extracted_nodes_eval.size());
17047 0 0 for (size_t i = 0; i < extracted_nodes_eval.size(); i++)
17048 0 0 extracted_embeddings_eval[i] = extracted_nodes_eval[i] >= 0 ? &nodes_embeddings_eval[extracted_nodes_eval[i]] : nullptr;
17051 0 0 parser.network.propagate(parser.embeddings, extracted_embeddings_eval, hidden_layer_eval, outcomes_eval, nullptr, false);
17055 0 0 for (unsigned i = 0; i < outcomes_eval.size(); i++)
17056 0 0 if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best]))
0 0 if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best]))
0 0 if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best]))
0 0 if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best]))
0 0 if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best]))
17060 0 0 int child = parser.system->perform(conf_eval, network_best);
17063 0 0 if (child >= 0)
17064 0 0 for (size_t i = 0; i < parser.embeddings.size(); i++) {
17065 0 0 parser.values[i].extract(t_eval.nodes[child], word);
17066 0 0 nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
17071 0 0 for (unsigned i = 1; i < gold.nodes.size(); i++)
17074 0 0 if (uas > best_uas) best = transition, best_uas = uas;
17078 0 0 network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace);
17081 0 0 if (workspace.outcomes[best])
17083 0 0 network_trainer.backpropagate(parser.embeddings, extracted_embeddings, best, workspace);
17092 0 0 int child = parser.system->perform(conf, /*network_*/best);
17095 0 0 if (child >= 0)
17096 0 0 for (size_t i = 0; i < parser.embeddings.size(); i++) {
17097 0 0 parser.values[i].extract(t.nodes[child], word);
17098 0 0 nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
17104 0 0 for (double old_atomic_logprob = atomic_logprob; atomic_logprob.compare_exchange_weak(old_atomic_logprob, old_atomic_logprob + logprob); ) {}
17107 0 0 cerr << "Iteration " << iteration << ": ";
0 0 cerr << "Iteration " << iteration << ": ";
17108 0 0 training();
17112 0 0 if (!heldout.empty()) {
17113 0 0 tree t;
17115 0 0 for (auto&& gold : heldout) {
17119 0 0 for (size_t i = 1; i < t.nodes.size(); i++) {
17122 0 0 correct_labelled += t.nodes[i].head == gold.nodes[i].head && t.nodes[i].deprel == gold.nodes[i].deprel;
0 0 correct_labelled += t.nodes[i].head == gold.nodes[i].head && t.nodes[i].deprel == gold.nodes[i].deprel;
17126 0 0 cerr << ", heldout UAS " << fixed << setprecision(2) << (100. * correct_unlabelled / total) << "%, LAS " << (100. * correct_labelled / total) << "%";
17128 0 0 if (parameters.early_stopping && correct_labelled > heldout_best_correct_labelled) {
0 0 if (parameters.early_stopping && correct_labelled > heldout_best_correct_labelled) {
17129 0 0 heldout_best_network = parser.network;
17138 0 0 if (parameters.early_stopping && heldout_best_iteration > 0) {
0 0 if (parameters.early_stopping && heldout_best_iteration > 0) {
17140 0 0 parser.network = heldout_best_network;
17144 0 0 enc.add_1B(parser.version);
17147 0 0 enc.add_1B(single_root);
17150 0 0 enc.add_2B(parser.labels.size());
17151 0 0 for (auto&& label : parser.labels)
17152 0 0 enc.add_str(label);
17153 0 0 enc.add_str(transition_system_name);
17156 0 0 enc.add_str(nodes_description);
17159 0 0 enc.add_2B(value_names.size());
17160 0 0 for (auto&& value_name : value_names)
17161 0 0 enc.add_str(value_name);
17162 0 0 for (auto&& embedding : parser.embeddings)
17163 0 0 embedding.save(enc);
17166 0 0 network_trainer.save_network(enc);
17188 0 387 if (conf.single_root && label_is_root)
0 0 if (conf.single_root && label_is_root)
17191 351 36 return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2];
90 261 return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2];
17195 0 15 assert(applicable(conf));
17206 0 395 if (conf.single_root && label_is_root)
0 0 if (conf.single_root && label_is_root)
17207 0 0 return conf.stack.size() == 2 && conf.buffer.empty();
0 0 return conf.stack.size() == 2 && conf.buffer.empty();
17208 0 395 else if (conf.single_root) // && !label_is_root
17215 0 23 assert(applicable(conf));
17229 0 28 assert(applicable(conf));
17238 0 0 return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1];
0 0 return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1];
0 0 return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1];
17242 0 0 assert(applicable(conf));
17253 0 0 if (conf.single_root && label_is_root)
0 0 if (conf.single_root && label_is_root)
17256 0 0 return conf.stack.size() >= 3 && conf.stack[conf.stack.size() - 3];
0 0 return conf.stack.size() >= 3 && conf.stack[conf.stack.size() - 3];
17260 0 0 assert(applicable(conf));
17273 0 0 if (conf.single_root && label_is_root)
0 0 if (conf.single_root && label_is_root)
17275 0 0 else if (conf.single_root) // && !label_is_root
17282 0 0 assert(applicable(conf));
17386 0 806 assert(transition < transitions.size());
17392 0 66 assert(transition < transitions.size());
17398 1 0 if (name == "projective") return new transition_system_projective(labels);
1 0 if (name == "projective") return new transition_system_projective(labels);
17399 0 0 if (name == "swap") return new transition_system_swap(labels);
0 0 if (name == "swap") return new transition_system_swap(labels);
17400 0 0 if (name == "link2") return new transition_system_link2(labels);
0 0 if (name == "link2") return new transition_system_link2(labels);
17422 0 0 transitions.emplace_back(new transition_shift());
0 0 transitions.emplace_back(new transition_shift());
17423 0 0 for (auto&& label : labels) {
17424 0 0 transitions.emplace_back(new transition_left_arc(label));
0 0 transitions.emplace_back(new transition_left_arc(label));
0 0 transitions.emplace_back(new transition_left_arc(label));
17425 0 0 transitions.emplace_back(new transition_right_arc(label));
0 0 transitions.emplace_back(new transition_right_arc(label));
0 0 transitions.emplace_back(new transition_right_arc(label));
17426 0 0 transitions.emplace_back(new transition_left_arc_2(label));
0 0 transitions.emplace_back(new transition_left_arc_2(label));
0 0 transitions.emplace_back(new transition_left_arc_2(label));
17427 0 0 transitions.emplace_back(new transition_right_arc_2(label));
0 0 transitions.emplace_back(new transition_right_arc_2(label));
0 0 transitions.emplace_back(new transition_right_arc_2(label));
17435 0 0 for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
0 0 for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
17463 0 0 if (!conf.buffer.empty()) transitions.push_back(0);
17468 0 0 for (int direction = 0; direction < 4; direction++)
17469 0 0 if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) {
0 0 if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) {
0 0 if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) {
17474 0 0 if (direction >= 2 && gold.nodes[child].head != parent) continue;
0 0 if (direction >= 2 && gold.nodes[child].head != parent) continue;
0 0 if (direction >= 2 && gold.nodes[child].head != parent) continue;
17476 0 0 for (size_t i = 0; i < labels.size(); i++)
17477 0 0 if (gold.nodes[child].deprel == labels[i])
17478 0 0 if (!conf.single_root ||
0 0 if (!conf.single_root ||
17479 0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
17480 0 0 (i != root_label && conf.stack.size() > 2 && direction < 2) ||
0 0 (i != root_label && conf.stack.size() > 2 && direction < 2) ||
0 0 (i != root_label && conf.stack.size() > 2 && direction < 2) ||
0 0 (i != root_label && conf.stack.size() > 2 && direction < 2) ||
17481 0 0 (i != root_label && conf.stack.size() > 3 && direction >= 2))
0 0 (i != root_label && conf.stack.size() > 3 && direction >= 2))
17490 0 0 for (int direction = 0; direction < 4; direction++)
17491 0 0 if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) {
0 0 if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) {
0 0 if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) {
17495 0 0 if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
0 0 if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
0 0 if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
17496 0 0 for (size_t i = 0; i < labels.size(); i++)
17497 0 0 if (gold.nodes[child].deprel == labels[i])
17510 0 0 if (name == "static") return new transition_system_link2_oracle_static(labels);
0 0 if (name == "static") return new transition_system_link2_oracle_static(labels);
17532 1 0 transitions.emplace_back(new transition_shift());
1 0 transitions.emplace_back(new transition_shift());
17533 6 1 for (auto&& label : labels) {
17534 6 0 transitions.emplace_back(new transition_left_arc(label));
6 0 transitions.emplace_back(new transition_left_arc(label));
6 0 transitions.emplace_back(new transition_left_arc(label));
17535 6 0 transitions.emplace_back(new transition_right_arc(label));
6 0 transitions.emplace_back(new transition_right_arc(label));
6 0 transitions.emplace_back(new transition_right_arc(label));
17543 0 0 for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
0 0 for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
17569 0 0 if (!conf.buffer.empty()) transitions.push_back(0);
17570 0 0 if (conf.stack.size() >= 2)
17571 0 0 for (int direction = 0; direction < 2; direction++) {
17573 0 0 for (size_t i = 0; i < labels.size(); i++)
17574 0 0 if (gold.nodes[child].deprel == labels[i])
17575 0 0 if (!conf.single_root ||
0 0 if (!conf.single_root ||
17576 0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
17577 0 0 (i != root_label && conf.stack.size() > 2))
17584 0 0 if (conf.stack.size() >= 2) {
17587 0 0 if (gold.nodes[child].head == parent) {
17588 0 0 for (size_t i = 0; i < labels.size(); i++)
17589 0 0 if (gold.nodes[child].deprel == labels[i])
17597 0 0 if (conf.stack.size() >= 2) {
17600 0 0 if (gold.nodes[child].head == parent &&
0 0 if (gold.nodes[child].head == parent &&
0 0 if (gold.nodes[child].head == parent &&
17601 0 0 (conf.buffer.empty() || gold.nodes[child].children.empty() || gold.nodes[child].children.back() < conf.buffer.back())) {
0 0 (conf.buffer.empty() || gold.nodes[child].children.empty() || gold.nodes[child].children.back() < conf.buffer.back())) {
17602 0 0 for (size_t i = 0; i < labels.size(); i++)
17603 0 0 if (gold.nodes[child].deprel == labels[i])
17618 0 0 for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
0 0 for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
17648 0 0 if (iteration <= 1)
17657 0 0 if (!conf.buffer.empty()) {
17659 0 0 for (size_t i = conf.buffer.size(); i--; ) {
17662 0 0 for (auto&& child : gold.nodes[node].children)
17664 0 0 if (to_right_stack) {
17665 0 0 right_stack.push_back(node);
17672 0 0 class t_representation {
0 0 class t_representation {
0 0 class t_representation {
0 0 class t_representation {
17675 0 0 : stack(stack), right_stack(right_stack), gold(gold), labels(labels) {
0 0 : stack(stack), right_stack(right_stack), gold(gold), labels(labels) {
17676 0 0 for (int i = 0; i < 2; i++) {
17677 0 0 costs[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size()));
17678 0 0 transitions[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size()));
17680 0 0 }
0 0 }
17690 0 0 int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; }
0 0 int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; }
0 0 int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; }
17693 0 0 for (size_t i = 0; i < labels.size(); i++)
17694 0 0 if (gold.nodes[child].deprel == labels[i])
17706 0 0 } t(conf.stack, right_stack, gold, labels);
17708 0 0 t.prepare(0);
17710 0 0 for (unsigned diagonal = 0; diagonal < conf.stack.size() + right_stack.size(); diagonal++) {
17711 0 0 t.prepare(diagonal + 1);
17712 0 0 for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) {
0 0 for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) {
0 0 for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) {
0 0 for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) {
17716 0 0 if (i+1 < conf.stack.size())
17717 0 0 for (unsigned h = 0; h <= diagonal; h++) {
17719 0 0 if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) {
0 0 if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) {
0 0 if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) {
17721 0 0 t.transition(i+1, j, h+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(h_node, new_node);
17723 0 0 if (t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i+1, j, 0) + (t.transition(i, j, h) != 0)) {
17725 0 0 t.transition(i+1, j, 0) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(new_node, h_node);
17730 0 0 if (j+1 < right_stack.size() + 1)
17731 0 0 for (unsigned h = 0; h <= diagonal; h++) {
17733 0 0 if (t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i, j+1, h) + (t.transition(i, j, h) > 0)) {
17735 0 0 t.transition(i, j+1, h) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0;
17737 0 0 if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) {
0 0 if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) {
0 0 if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) {
17739 0 0 t.transition(i, j+1, diagonal+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0;
17750 0 0 if (name == "static") return new transition_system_projective_oracle_static(labels);
0 0 if (name == "static") return new transition_system_projective_oracle_static(labels);
17751 0 0 if (name == "dynamic") return new transition_system_projective_oracle_dynamic(labels);
0 0 if (name == "dynamic") return new transition_system_projective_oracle_dynamic(labels);
17773 0 0 transitions.emplace_back(new transition_shift());
0 0 transitions.emplace_back(new transition_shift());
17774 0 0 transitions.emplace_back(new transition_swap());
0 0 transitions.emplace_back(new transition_swap());
17775 0 0 for (auto&& label : labels) {
17776 0 0 transitions.emplace_back(new transition_left_arc(label));
0 0 transitions.emplace_back(new transition_left_arc(label));
0 0 transitions.emplace_back(new transition_left_arc(label));
17777 0 0 transitions.emplace_back(new transition_right_arc(label));
0 0 transitions.emplace_back(new transition_right_arc(label));
0 0 transitions.emplace_back(new transition_right_arc(label));
17785 0 0 for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
0 0 for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
17791 0 0 : labels(labels), root_label(root_label), gold(gold), projective_order(projective_order), projective_components(projective_components) {}
0 0 : labels(labels), root_label(root_label), gold(gold), projective_order(projective_order), projective_components(projective_components) {}
17818 0 0 if (lazy) {
17819 0 0 tree_oracle_static projective_oracle(labels, root_label, gold, vector(), vector());
17822 0 0 transition_system_swap system(labels);
17824 0 0 conf.init(&t);
17825 0 0 while (!conf.final()) {
17827 0 0 if (!system.applicable(conf, prediction.to_follow)) break;
0 0 if (!system.applicable(conf, prediction.to_follow)) break;
17828 0 0 system.perform(conf, prediction.to_follow);
17832 0 0 for (auto&& node : conf.stack)
17833 0 0 if (node)
17837 0 0 return unique_ptr(new tree_oracle_static(labels, root_label, gold, move(projective_order), move(projective_components)));
0 0 return unique_ptr(new tree_oracle_static(labels, root_label, gold, move(projective_order), move(projective_components)));
17842 0 0 while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node)
0 0 while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node)
0 0 while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node)
17845 0 0 while (child_index < gold.nodes[node].children.size())
17851 0 0 for (auto&& child : gold.nodes[node].children)
17857 0 0 if (!conf.buffer.empty()) transitions.push_back(0);
17858 0 0 if (conf.stack.size() >= 2) {
17860 0 0 if (!projective_order.empty()) {
17863 0 0 if (projective_order[last] < projective_order[prev] &&
0 0 if (projective_order[last] < projective_order[prev] &&
0 0 if (projective_order[last] < projective_order[prev] &&
17864 0 0 (projective_components.empty() ||
17865 0 0 (conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()])))
17870 0 0 for (int direction = 0; direction < 2; direction++) {
17872 0 0 for (size_t i = 0; i < labels.size(); i++)
17873 0 0 if (gold.nodes[child].deprel == labels[i])
17874 0 0 if (!conf.single_root ||
0 0 if (!conf.single_root ||
17875 0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
0 0 (i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
17876 0 0 (i != root_label && conf.stack.size() > 2))
17884 0 0 if (conf.stack.size() >= 2) {
17887 0 0 if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
0 0 if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
0 0 if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
17888 0 0 for (size_t i = 0; i < labels.size(); i++)
17889 0 0 if (gold.nodes[child].deprel == labels[i])
17897 0 0 if (conf.stack.size() >= 2) {
17900 0 0 if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
0 0 if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
0 0 if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
17901 0 0 for (size_t i = 0; i < labels.size(); i++)
17902 0 0 if (gold.nodes[child].deprel == labels[i])
17910 0 0 if (conf.stack.size() >= 2 && !projective_order.empty()) {
0 0 if (conf.stack.size() >= 2 && !projective_order.empty()) {
0 0 if (conf.stack.size() >= 2 && !projective_order.empty()) {
17913 0 0 if (projective_order[last] < projective_order[prev] &&
0 0 if (projective_order[last] < projective_order[prev] &&
0 0 if (projective_order[last] < projective_order[prev] &&
17914 0 0 (projective_components.empty() ||
17915 0 0 (conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()])))
17925 0 0 if (name == "static_eager") return new transition_system_swap_oracle_static(labels, false);
0 0 if (name == "static_eager") return new transition_system_swap_oracle_static(labels, false);
17926 0 0 if (name == "static_lazy") return new transition_system_swap_oracle_static(labels, true);
0 0 if (name == "static_lazy") return new transition_system_swap_oracle_static(labels, true);
17950 1 0 clear();
17964 0 0 nodes.emplace_back((int)nodes.size(), form);
0 0 nodes.emplace_back((int)nodes.size(), form);
0 0 nodes.emplace_back((int)nodes.size(), form);
7 0 nodes.emplace_back((int)nodes.size(), form);
17969 38 0 assert(id >= 0 && id < int(nodes.size()));
0 38 assert(id >= 0 && id < int(nodes.size()));
17970 0 38 assert(head < int(nodes.size()));
17973 0 38 if (nodes[id].head >= 0) {
17975 0 0 for (size_t i = children.size(); i && children[i-1] >= id; i--)
0 0 for (size_t i = children.size(); i && children[i-1] >= id; i--)
0 0 for (size_t i = children.size(); i && children[i-1] >= id; i--)
17976 0 0 if (children[i-1] == id) {
17985 38 0 if (head >= 0) {
17988 27 29 while (i && children[i-1] > id) i--;
9 18 while (i && children[i-1] > id) i--;
18 38 while (i && children[i-1] > id) i--;
17989 9 29 if (!i || children[i-1] < id) children.insert(children.begin() + i, id);
9 0 if (!i || children[i-1] < id) children.insert(children.begin() + i, id);
38 0 if (!i || children[i-1] < id) children.insert(children.begin() + i, id);
17994 0 0 for (auto&& node : nodes) {
0 0 for (auto&& node : nodes) {
0 0 for (auto&& node : nodes) {
0 0 for (auto&& node : nodes) {
8 1 for (auto&& node : nodes) {
18088 0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
18118 0 0 if (name == "conllu") return new_conllu_input_format();
18128 0 0 if (name == "conllu") return new_conllu_output_format();
18156 0 0 if (make_copy) {
18171 0 0 while (text.len) {
18174 0 0 while (line.len < text.len && line.str[line.len] != '\n') line.len++;
0 0 while (line.len < text.len && line.str[line.len] != '\n') line.len++;
18179 0 0 if (!line.len) {
18180 0 0 if (t.empty()) continue;
18184 0 0 if (*line.str == '#') {
18186 0 0 if (t.empty()) comments.push_back(line);
0 0 if (t.empty()) comments.push_back(line);
18191 0 0 split(line, '\t', tokens);
18192 0 0 if (tokens.size() != 10)
18193 0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false;
0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false;
0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false;
18196 0 0 if (memchr(tokens[0].str, '-', tokens[0].len)) {
18197 0 0 split(tokens[0], '-', parts);
18198 0 0 if (parts.size() != 2)
18199 0 0 return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false;
0 0 return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false;
0 0 return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false;
18201 0 0 if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
0 0 if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
0 0 if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
0 0 if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
0 0 if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
18203 0 0 if (from != int(t.nodes.size()))
18204 0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
18205 0 0 if (to < from)
18206 0 0 return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
18207 0 0 if (from <= last_multiword_token)
18208 0 0 return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false;
0 0 return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false;
0 0 return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false;
18210 0 0 multiword_tokens.emplace_back(from, line);
18216 0 0 if (!parse_int(tokens[0], "CoNLL-U id", id, error))
0 0 if (!parse_int(tokens[0], "CoNLL-U id", id, error))
18218 0 0 if (id != int(t.nodes.size()))
18219 0 0 return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
18222 0 0 if (tokens[6].len == 1 && tokens[6].str[0] == '_') {
0 0 if (tokens[6].len == 1 && tokens[6].str[0] == '_') {
0 0 if (tokens[6].len == 1 && tokens[6].str[0] == '_') {
18225 0 0 if (!parse_int(tokens[6], "CoNLL-U head", head, error))
0 0 if (!parse_int(tokens[6], "CoNLL-U head", head, error))
18227 0 0 if (head < 0)
18228 0 0 return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false;
0 0 return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false;
0 0 return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false;
18233 0 0 if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len);
0 0 if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len);
0 0 if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len);
18234 0 0 if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len);
0 0 if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len);
0 0 if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len);
18235 0 0 if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len);
0 0 if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len);
0 0 if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len);
18236 0 0 if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len);
0 0 if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len);
0 0 if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len);
18238 0 0 if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len);
0 0 if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len);
0 0 if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len);
18239 0 0 if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len);
0 0 if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len);
0 0 if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len);
18240 0 0 if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len);
0 0 if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len);
0 0 if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len);
18244 0 0 if (last_multiword_token >= int(t.nodes.size()))
18245 0 0 return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false;
0 0 return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false;
0 0 return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false;
18248 0 0 for (auto&& node : t.nodes)
18249 0 0 if (node.id && node.head >= 0) {
0 0 if (node.id && node.head >= 0) {
18250 0 0 if (node.head >= int(t.nodes.size()))
18251 0 0 return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false;
0 0 return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false;
0 0 return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false;
0 0 return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false;
18252 0 0 t.set_head(node.id, node.head, node.deprel);
18266 0 0 auto input_conllu = dynamic_cast(additional_info);
18270 0 0 if (input_conllu)
18271 0 0 for (auto&& comment : input_conllu->comments)
18275 0 0 for (int i = 1 /*skip the root node*/; i < int(t.nodes.size()); i++) {
18277 0 0 if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() &&
0 0 if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() &&
0 0 if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() &&
0 0 if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() &&
18285 0 0 output.append(to_string(i)).push_back('\t');
18291 0 0 output.append(t.nodes[i].head < 0 ? "_" : to_string(t.nodes[i].head)).push_back('\t');
0 0 output.append(t.nodes[i].head < 0 ? "_" : to_string(t.nodes[i].head)).push_back('\t');
18348 0 0 return {1, 1, 1, "devel"};
0 0 return {1, 1, 1, "devel"};
18359 0 0 << (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease
0 0 << (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease
18361 0 0 << (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n"
0 0 << (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n"
18363 0 0 "Mathematics and Physics, Charles University in Prague, Czech Republic.";
18407 20 2 const string input_format_conllu::columns[10] = {"ID", "FORM", "LEMMA",
18408 2 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
2 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
2 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
2 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
2 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
2 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
2 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
2 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
2 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
2 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
0 0 "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
18419 0 0 if (make_copy) {
18432 0 0 while (text.len) {
18435 0 0 while (line.len < text.len && (line.str[line.len] != '\r' && line.str[line.len] != '\n')) line.len++;
0 0 while (line.len < text.len && (line.str[line.len] != '\r' && line.str[line.len] != '\n')) line.len++;
18438 0 0 if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n')
0 0 if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n')
0 0 if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n')
18440 0 0 else if (text.len && *text.str == '\n')
0 0 else if (text.len && *text.str == '\n')
18444 0 0 if (!line.len) {
18445 0 0 if (s.empty()) continue;
18449 0 0 if (*line.str == '#') {
18451 0 0 if (s.empty()) s.comments.emplace_back(line.str, line.len);
0 0 if (s.empty()) s.comments.emplace_back(line.str, line.len);
18456 0 0 split(line, '\t', tokens);
18457 0 0 if (tokens.size() != 10)
18458 0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false;
0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false;
0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false;
18461 0 0 for (int i = 0; i < 10; i++) {
18462 0 0 if (!tokens[i].len)
18463 0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false;
0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false;
0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false;
0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false;
18464 0 0 if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL)
0 0 if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL)
0 0 if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL)
0 0 if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL)
0 0 if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL)
18465 0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false;
0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false;
0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false;
0 0 return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false;
18469 0 0 if (memchr(tokens[0].str, '-', tokens[0].len)) {
18470 0 0 split(tokens[0], '-', parts);
18471 0 0 if (parts.size() != 2)
18472 0 0 return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false;
0 0 return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false;
0 0 return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false;
18474 0 0 if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
0 0 if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
0 0 if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
0 0 if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
0 0 if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
18476 0 0 if (from != int(s.words.size()))
18477 0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
18478 0 0 if (to < from)
18479 0 0 return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
18480 0 0 if (from <= last_multiword_token)
18481 0 0 return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false;
0 0 return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false;
0 0 return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false;
18483 0 0 for (int i = 2; i < 9; i++)
18484 0 0 if (tokens[i].len != 1 || tokens[i].str[0] != '_')
0 0 if (tokens[i].len != 1 || tokens[i].str[0] != '_')
0 0 if (tokens[i].len != 1 || tokens[i].str[0] != '_')
18485 0 0 return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false;
0 0 return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false;
0 0 return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false;
0 0 return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false;
18486 0 0 s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]);
0 0 s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]);
0 0 s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]);
18491 0 0 if (version >= 2)
18492 0 0 if (memchr(tokens[0].str, '.', tokens[0].len)) {
18493 0 0 split(tokens[0], '.', parts);
18494 0 0 if (parts.size() != 2)
18495 0 0 return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false;
0 0 return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false;
0 0 return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false;
18497 0 0 if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error))
0 0 if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error))
0 0 if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error))
0 0 if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error))
0 0 if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error))
18499 0 0 if (id != int(s.words.size()) - 1)
18500 0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
18501 0 0 if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) ||
0 0 if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) ||
0 0 if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) ||
0 0 if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) ||
0 0 if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) ||
18502 0 0 (!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1)))
0 0 (!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1)))
0 0 (!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1)))
0 0 (!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1)))
18503 0 0 return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
18504 0 0 for (int i = 6; i < 8; i++)
18505 0 0 if (tokens[i].len != 1 || tokens[i].str[0] != '_')
0 0 if (tokens[i].len != 1 || tokens[i].str[0] != '_')
0 0 if (tokens[i].len != 1 || tokens[i].str[0] != '_')
18506 0 0 return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false;
0 0 return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false;
0 0 return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false;
0 0 return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false;
18508 0 0 s.empty_nodes.emplace_back(id, index);
18511 0 0 if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len);
0 0 if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len);
0 0 if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len);
18512 0 0 if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len);
0 0 if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len);
0 0 if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len);
18513 0 0 if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len);
0 0 if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len);
0 0 if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len);
18514 0 0 if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len);
0 0 if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len);
0 0 if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len);
18515 0 0 if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len);
0 0 if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len);
0 0 if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len);
18521 0 0 if (!parse_int(tokens[0], "CoNLL-U id", id, error))
0 0 if (!parse_int(tokens[0], "CoNLL-U id", id, error))
18523 0 0 if (id != int(s.words.size()))
18524 0 0 return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
0 0 return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
18527 0 0 if (tokens[6].len == 1 && tokens[6].str[0] == '_') {
0 0 if (tokens[6].len == 1 && tokens[6].str[0] == '_') {
0 0 if (tokens[6].len == 1 && tokens[6].str[0] == '_') {
18530 0 0 if (!parse_int(tokens[6], "CoNLL-U head", head, error))
0 0 if (!parse_int(tokens[6], "CoNLL-U head", head, error))
18532 0 0 if (head < 0)
18533 0 0 return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false;
0 0 return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false;
0 0 return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false;
18539 0 0 if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len);
0 0 if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len);
0 0 if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len);
18540 0 0 if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len);
0 0 if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len);
0 0 if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len);
18541 0 0 if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len);
0 0 if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len);
0 0 if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len);
18543 0 0 if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len);
0 0 if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len);
0 0 if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len);
18544 0 0 if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len);
0 0 if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len);
0 0 if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len);
18545 0 0 if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len);
0 0 if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len);
0 0 if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len);
18549 0 0 if (last_multiword_token >= int(s.words.size()))
18550 0 0 return error.assign("There are words missing for multiword token '").append(s.multiword_tokens.back().form).append("'!"), false;
0 0 return error.assign("There are words missing for multiword token '").append(s.multiword_tokens.back().form).append("'!"), false;
18553 0 0 for (auto&& word : s.words)
18554 0 0 if (word.id && word.head >= 0) {
0 0 if (word.id && word.head >= 0) {
18555 0 0 if (word.head >= int(s.words.size()))
18556 0 0 return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false;
0 0 return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false;
0 0 return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false;
0 0 return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false;
18557 0 0 s.set_head(word.id, word.head, word.deprel);
18581 0 0 if (getline(is, block))
18595 0 0 if (make_copy) {
18607 0 0 while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) {
0 0 while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) {
0 0 while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) {
0 0 while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) {
18613 0 0 while (text.len && *text.str != '\r' && *text.str != '\n') {
0 0 while (text.len && *text.str != '\r' && *text.str != '\n') {
0 0 while (text.len && *text.str != '\r' && *text.str != '\n') {
18617 0 0 while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
0 0 while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
0 0 while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
0 0 while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
0 0 while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
18623 0 0 if (s.words.back().form.find("\302\240") != string::npos) {
18626 0 0 for (size_t i = 0; i < form.size(); i++) {
18627 0 0 if (form_len && form[form_len-1] == '\302' && form[i] == '\240')
0 0 if (form_len && form[form_len-1] == '\302' && form[i] == '\240')
0 0 if (form_len && form[form_len-1] == '\302' && form[i] == '\240')
0 0 if (form_len && form[form_len-1] == '\302' && form[i] == '\240')
18636 0 0 while (text.len && (*text.str == ' ' || *text.str == '\t'))
0 0 while (text.len && (*text.str == ' ' || *text.str == '\t'))
18640 0 0 if (!s.empty()) {
18642 0 0 if (new_document)
18647 0 0 if (preceeding_newlines >= 2)
18652 0 0 s.set_sent_id(to_string(sentence_id++));
18688 0 0 if (make_copy) {
18700 0 0 while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) {
0 0 while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) {
0 0 while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) {
18706 0 0 while (text.len && *text.str != '\r' && *text.str != '\n') {
0 0 while (text.len && *text.str != '\r' && *text.str != '\n') {
0 0 while (text.len && *text.str != '\r' && *text.str != '\n') {
18710 0 0 while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
0 0 while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
0 0 while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
0 0 while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n')
18716 0 0 while (text.len && *text.str != '\r' && *text.str != '\n')
0 0 while (text.len && *text.str != '\r' && *text.str != '\n')
0 0 while (text.len && *text.str != '\r' && *text.str != '\n')
18720 0 0 if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n')
0 0 if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n')
0 0 if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n')
18722 0 0 else if (text.len && *text.str == '\n')
0 0 else if (text.len && *text.str == '\n')
18726 0 0 while (text.len && *text.str == '\t')
0 0 while (text.len && *text.str == '\t')
18730 0 0 if (!s.empty()) {
18732 0 0 if (new_document)
18737 0 0 if (preceeding_newlines >= 2)
18742 0 0 s.set_sent_id(to_string(sentence_id++));
18769 0 0 if (getline(is, block))
18784 0 0 if (make_copy) {
18797 0 0 while (text.len && s.empty()) {
0 0 while (text.len && s.empty()) {
0 0 while (text.len && s.empty()) {
18800 0 0 while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) {
0 0 while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) {
18804 0 0 while (line.len < text.len && (line.str[line.len] != '\n' && line.str[line.len] != '\r'))
0 0 while (line.len < text.len && (line.str[line.len] != '\n' && line.str[line.len] != '\r'))
18806 0 0 while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) {
0 0 while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) {
18813 0 0 tokenizer->set_text(line, false);
18814 0 0 while (tokenizer->next_sentence(partial, error)) {
0 0 while (tokenizer->next_sentence(partial, error)) {
18817 0 0 for (size_t i = 1; i < partial.words.size(); i++) {
18820 0 0 if (s.words.back().head > 0) s.words.back().head += words;
18824 0 0 for (auto&& multiword_token : partial.multiword_tokens) {
18831 0 0 for (auto&& empty_node : partial.empty_nodes) {
18836 0 0 if (!error.empty()) return false;
18838 0 0 if (s.empty()) {
18844 0 0 if (!s.empty()) {
18846 0 0 if (new_document)
18847 0 0 s.set_new_doc(true, document_id);
18851 0 0 if (preceeding_newlines >= 2)
18852 0 0 s.set_new_par(true);
18856 0 0 s.set_sent_id(to_string(sentence_id++));
18859 0 0 s.comments.emplace_back("# text = ");
18860 0 0 for (size_t i = 1, j = 0; i < s.words.size(); i++) {
18861 0 0 const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form;
0 0 const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form;
18862 0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
18866 0 0 if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
0 0 if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
0 0 if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
0 0 if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
0 0 if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
18877 0 0 if (!named_values::parse(options, parsed_options, parse_error))
0 0 if (!named_values::parse(options, parsed_options, parse_error))
18881 0 0 if (parsed_options.count(CONLLU_V1))
18883 0 0 if (parsed_options.count(CONLLU_V2))
18886 0 0 return new input_format_conllu(version);
18892 0 0 if (!named_values::parse(options, parsed_options, parse_error))
0 0 if (!named_values::parse(options, parsed_options, parse_error))
18898 0 0 input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges);
0 0 input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges);
0 0 input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges);
18899 0 0 return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result;
0 0 return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result;
0 0 return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result;
18912 0 0 size_t name_len = equal != string::npos ? equal : name.size();
18913 0 0 size_t option_offset = equal != string::npos ? equal + 1 : name.size();
18915 0 0 if (name.compare(0, name_len, "conllu") == 0) return new_conllu_input_format(name.substr(option_offset));
0 0 if (name.compare(0, name_len, "conllu") == 0) return new_conllu_input_format(name.substr(option_offset));
18916 0 0 if (name.compare(0, name_len, "generic_tokenizer") == 0) return new_generic_tokenizer_input_format(name.substr(option_offset));
0 0 if (name.compare(0, name_len, "generic_tokenizer") == 0) return new_generic_tokenizer_input_format(name.substr(option_offset));
18917 0 0 if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_input_format(name.substr(option_offset));
0 0 if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_input_format(name.substr(option_offset));
18918 0 0 if (name.compare(0, name_len, "vertical") == 0) return new_vertical_input_format(name.substr(option_offset));
0 0 if (name.compare(0, name_len, "vertical") == 0) return new_vertical_input_format(name.substr(option_offset));
18966 0 0 while (str.len) {
18967 0 0 while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"'))
0 0 while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"'))
0 0 while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"'))
0 0 while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"'))
0 0 while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"'))
0 0 while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"'))
18970 0 0 if (str.len) {
18971 0 0 if (to_print < str.str) os.write(to_print, str.str - to_print);
18972 0 0 os << (*str.str == '<' ? "<" : *str.str == '>' ? ">" : *str.str == '&' ? "&" : """);
0 0 os << (*str.str == '<' ? "<" : *str.str == '>' ? ">" : *str.str == '&' ? "&" : """);
0 0 os << (*str.str == '<' ? "<" : *str.str == '>' ? ">" : *str.str == '&' ? "&" : """);
18978 0 0 if (to_print < str.str) os.write(to_print, str.str - to_print);
19014 0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
4 3 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
7 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
7 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
7 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
2 5 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 7 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
7 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
0 0 const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
19022 1 4 for (auto&& comment : s.comments)
19027 8 1 for (int i = 0; i < int(s.words.size()); i++) {
19029 7 1 if (i > 0) {
19031 0 7 if (multiword_token < s.multiword_tokens.size() &&
0 0 if (multiword_token < s.multiword_tokens.size() &&
0 7 if (multiword_token < s.multiword_tokens.size() &&
19047 0 7 if (s.words[i].head < 0) os << '_'; else os << s.words[i].head; os << '\t'
19054 8 0 if (version >= 2)
19055 0 8 for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) {
0 0 for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) {
0 8 for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) {
19072 0 14 if (version >= 2 || str.find(' ') == string::npos)
0 0 if (version >= 2 || str.find(' ') == string::npos)
14 0 if (version >= 2 || str.find(' ') == string::npos)
19075 0 0 for (auto&& chr : str)
19076 0 0 os << (chr == ' ' ? '_' : chr);
19092 0 0 json_builder& close() { if (!stack.empty()) { json.push_back(stack.back()); stack.pop_back(); } comma_needed = true; return *this; }
0 0 json_builder& close() { if (!stack.empty()) { json.push_back(stack.back()); stack.pop_back(); } comma_needed = true; return *this; }
19094 0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
0 0 json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
19095 0 0 json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
0 0 json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
0 0 json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
0 0 json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
0 0 json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
0 0 json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
0 0 json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
0 0 json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
0 0 json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
0 0 json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
19103 0 0 if (comma_needed) {
19111 0 0 for (; str.len; str.str++, str.len--)
19121 0 0 if (((unsigned char)*str.str) < 32) {
19131 0 0 for (; value || start_size == json.size(); value /= 10)
0 0 for (; value || start_size == json.size(); value /= 10)
0 0 for (; value || start_size == json.size(); value /= 10)
19146 0 0 json.object().key("id").value(++sentences).key("nodes").array();
0 0 json.object().key("id").value(++sentences).key("nodes").array();
0 0 json.object().key("id").value(++sentences).key("nodes").array();
0 0 json.object().key("id").value(++sentences).key("nodes").array();
19148 0 0 for (size_t i = 1; i < s.words.size(); i++) {
19149 0 0 json.object().key("id").value(i).key("form").value(s.words[i].form);
0 0 json.object().key("id").value(i).key("form").value(s.words[i].form);
0 0 json.object().key("id").value(i).key("form").value(s.words[i].form);
19152 0 0 if (s.words[i].get_token_range(start, end))
19153 0 0 json.key("start").value(start).key("end").value(end);
0 0 json.key("start").value(start).key("end").value(end);
19154 0 0 if (s.words[i].head == 0)
19157 0 0 json.key("properties").object()
0 0 json.key("properties").object()
19158 0 0 .key("lemma").value(s.words[i].lemma)
19159 0 0 .key("upos").value(s.words[i].upostag)
19160 0 0 .key("xpos").value(s.words[i].xpostag);
19162 0 0 for (auto&& feat : feats) {
19164 0 0 while (key.len < feat.len && key.str[key.len] != '=')
0 0 while (key.len < feat.len && key.str[key.len] != '=')
19166 0 0 if (key.len + 1 < feat.len)
19167 0 0 json.key(key).value(string_piece(key.str + key.len + 1, feat.len - key.len - 1));
19171 0 0 if (!s.words[i].children.empty()) {
19173 0 0 for (auto&& child : s.words[i].children)
19174 0 0 json.object().key("label").value(s.words[child].deprel).key("target").value(child).close();
0 0 json.object().key("label").value(s.words[child].deprel).key("target").value(child).close();
0 0 json.object().key("label").value(s.words[child].deprel).key("target").value(child).close();
0 0 json.object().key("label").value(s.words[child].deprel).key("target").value(child).close();
19204 0 0 if (!sentences) {
19210 0 0 for (auto&& node : s.words[0].children)
19211 0 0 write_node(s, node, pad, os);
19226 0 0 os << pad << "
0 0 os << pad << "
19227 0 0 << "\" form=\"" << xml_encoded(s.words[node].form, true)
19228 0 0 << "\" lem=\"" << xml_encoded(s.words[node].lemma, true)
19229 0 0 << "\" mi=\"" << xml_encoded(s.words[node].feats, true)
19230 0 0 << "\" si=\"" << xml_encoded(s.words[node].deprel, true) << '"';
19232 0 0 if (s.words[node].children.empty()) {
19236 0 0 for (auto&& child : s.words[node].children)
19258 0 0 if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
19263 0 0 for (size_t i = 1; i < s.words.size(); i++) {
19265 0 0 for (auto&& chr : s.words[i].form)
19266 0 0 if (chr == ' ')
19267 0 0 line.append("\302\240");
19269 0 0 line.push_back(chr);
19271 0 0 if (i+1 < s.words.size())
19272 0 0 line.push_back(' ');
19290 0 0 if (normalized) {
19291 0 0 if (!empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (!empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (!empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (!empty && (s.get_new_doc() || s.get_new_par()))
19293 0 0 for (size_t i = 1, j = 0; i < s.words.size(); i++) {
19294 0 0 const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
0 0 const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
19296 0 0 if (i+1 < s.words.size() && tok.get_space_after())
0 0 if (i+1 < s.words.size() && tok.get_space_after())
0 0 if (i+1 < s.words.size() && tok.get_space_after())
19298 0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
19304 0 0 for (size_t i = 1, j = 0; i < s.words.size(); i++) {
19305 0 0 const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
0 0 const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
19306 0 0 tok.get_spaces_before(spaces); os << spaces;
19307 0 0 tok.get_spaces_in_token(spaces); os << (!spaces.empty() ? spaces : tok.form);
0 0 tok.get_spaces_in_token(spaces); os << (!spaces.empty() ? spaces : tok.form);
19308 0 0 tok.get_spaces_after(spaces); os << spaces;
19309 0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
19331 0 0 if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
0 0 if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par()))
19335 0 0 for (size_t i = 1; i < s.words.size(); i++)
19344 1 0 if (!named_values::parse(options, parsed_options, parse_error))
1 0 if (!named_values::parse(options, parsed_options, parse_error))
19348 0 1 if (parsed_options.count(CONLLU_V1))
19350 0 1 if (parsed_options.count(CONLLU_V2))
19353 1 0 return new output_format_conllu(version);
19361 0 0 return new output_format_matxin();
19367 0 0 if (!named_values::parse(options, parsed_options, parse_error))
0 0 if (!named_values::parse(options, parsed_options, parse_error))
19370 0 0 return new output_format_horizontal(parsed_options.count(HORIZONTAL_PARAGRAPHS));
19376 0 0 if (!named_values::parse(options, parsed_options, parse_error))
0 0 if (!named_values::parse(options, parsed_options, parse_error))
19379 0 0 return new output_format_plaintext(parsed_options.count(PLAINTEXT_NORMALIZED_SPACES));
19385 0 0 if (!named_values::parse(options, parsed_options, parse_error))
0 0 if (!named_values::parse(options, parsed_options, parse_error))
19388 0 0 return new output_format_vertical(parsed_options.count(VERTICAL_PARAGRAPHS));
19393 1 0 size_t name_len = equal != string::npos ? equal : name.size();
19394 0 1 size_t option_offset = equal != string::npos ? equal + 1 : name.size();
19396 1 0 if (name.compare(0, name_len, "conllu") == 0) return new_conllu_output_format(name.substr(option_offset));
1 0 if (name.compare(0, name_len, "conllu") == 0) return new_conllu_output_format(name.substr(option_offset));
19397 0 0 if (name.compare(0, name_len, "epe") == 0) return new_epe_output_format(name.substr(option_offset));
0 0 if (name.compare(0, name_len, "epe") == 0) return new_epe_output_format(name.substr(option_offset));
19398 0 0 if (name.compare(0, name_len, "matxin") == 0) return new_matxin_output_format(name.substr(option_offset));
19399 0 0 if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_output_format(name.substr(option_offset));
0 0 if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_output_format(name.substr(option_offset));
19400 0 0 if (name.compare(0, name_len, "plaintext") == 0) return new_plaintext_output_format(name.substr(option_offset));
0 0 if (name.compare(0, name_len, "plaintext") == 0) return new_plaintext_output_format(name.substr(option_offset));
19401 0 0 if (name.compare(0, name_len, "vertical") == 0) return new_vertical_output_format(name.substr(option_offset));
0 0 if (name.compare(0, name_len, "vertical") == 0) return new_vertical_output_format(name.substr(option_offset));
19421 1 0 clear();
19439 0 0 words.emplace_back((int)words.size(), form);
0 0 words.emplace_back((int)words.size(), form);
0 0 words.emplace_back((int)words.size(), form);
19444 7 0 assert(id >= 0 && id < int(words.size()));
0 7 assert(id >= 0 && id < int(words.size()));
19445 0 7 assert(head < int(words.size()));
19448 0 7 if (words[id].head >= 0) {
19450 0 0 for (size_t i = children.size(); i && children[i-1] >= id; i--)
0 0 for (size_t i = children.size(); i && children[i-1] >= id; i--)
0 0 for (size_t i = children.size(); i && children[i-1] >= id; i--)
19451 0 0 if (children[i-1] == id) {
19460 7 0 if (head >= 0) {
19463 4 3 while (i && children[i-1] > id) i--;
4 0 while (i && children[i-1] > id) i--;
0 7 while (i && children[i-1] > id) i--;
19464 4 3 if (!i || children[i-1] < id) children.insert(children.begin() + i, id);
4 0 if (!i || children[i-1] < id) children.insert(children.begin() + i, id);
7 0 if (!i || children[i-1] < id) children.insert(children.begin() + i, id);
19469 0 0 for (auto&& word : words) {
19477 0 0 if (get_comment("newdoc id", id))
19486 1 0 if (new_doc && id.len)
0 1 if (new_doc && id.len)
19488 1 0 else if (new_doc)
19493 0 0 if (get_comment("newpar id", id))
19502 1 0 if (new_par && id.len)
0 1 if (new_par && id.len)
19504 1 0 else if (new_par)
19517 1 0 if (id.len)
19530 0 0 if (text.len)
19535 0 0 for (auto&& comment : comments)
19536 0 0 if (comment[0] == '#') {
19539 0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
19542 0 0 if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) {
0 0 if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) {
0 0 if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) {
19544 0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
19545 0 0 if (j < comment.size() && comment[j] == '=') {
0 0 if (j < comment.size() && comment[j] == '=') {
0 0 if (j < comment.size() && comment[j] == '=') {
19548 0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
0 0 while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++;
19549 0 0 if (value) value->assign(comment, j, comment.size() - j);
19552 0 0 if (value) value->clear();
19563 7 8 for (unsigned i = comments.size(); i--; )
19564 0 7 if (comments[i][0] == '#') {
19567 14 0 while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++;
7 7 while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++;
0 7 while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++;
7 7 while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++;
19570 2 5 if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0)
2 0 if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0)
7 0 if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0)
19579 3 0 comment.append("# ").append(name.str, name.len);
3 0 comment.append("# ").append(name.str, name.len);
19580 1 2 if (value.len) {
19581 1 0 comment.append(" = ");
19582 1 1 for (size_t i = 0; i < value.len; i++)
19583 1 0 comment.push_back(value.str[i] == '\r' || value.str[i] == '\n' ? ' ' : value.str[i]);
1 0 comment.push_back(value.str[i] == '\r' || value.str[i] == '\n' ? ' ' : value.str[i]);
19602 3 8 if (form.len) this->form.assign(form.str, form.len);
19603 0 11 if (misc.len) this->misc.assign(misc.str, misc.len);
19610 2 4 return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0);
2 0 return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0);
2 0 return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0);
19614 5 2 if (space_after)
19624 0 0 if (get_misc_field("SpacesBefore", value))
19631 7 0 if (spaces_before.len == 0)
19640 0 0 if (get_misc_field("SpacesAfter", value))
19643 0 0 spaces_after.assign(get_space_after() ? " " : "");
19647 2 5 if (spaces_after.len == 0) {
19650 5 0 } else if (spaces_after.len == 1 && spaces_after.str[0] == ' ') {
5 0 } else if (spaces_after.len == 1 && spaces_after.str[0] == ' ') {
19662 0 0 if (get_misc_field("SpacesInToken", value))
19669 7 0 if (spaces_in_token.len == 0)
19679 0 0 if (!get_misc_field("TokenRange", value)) return false;
19682 0 0 while (value.len && value.str[0] >= '0' && value.str[0] <= '9') {
0 0 while (value.len && value.str[0] >= '0' && value.str[0] <= '9') {
0 0 while (value.len && value.str[0] >= '0' && value.str[0] <= '9') {
19683 0 0 if (start > (numeric_limits::max() - (value.str[0] - '0')) / 10)
19689 0 0 if (value.len == 0 || value.str[0] != ':') return false;
0 0 if (value.len == 0 || value.str[0] != ':') return false;
19693 0 0 while (value.len && value.str[0] >= '0' && value.str[0] <= '9') {
0 0 while (value.len && value.str[0] >= '0' && value.str[0] <= '9') {
0 0 while (value.len && value.str[0] >= '0' && value.str[0] <= '9') {
19694 0 0 if (end > (numeric_limits::max() - (value.str[0] - '0')) / 10)
19704 0 0 if (start == size_t(string::npos))
19707 0 0 start_misc_field("TokenRange").append(to_string(start)).append(1, ':').append(to_string(end));
19712 2 4 for (size_t index = 0; index < misc.size(); ) {
19713 2 0 if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') {
0 2 if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') {
2 0 if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') {
19717 2 0 value.len = (value.len == size_t(string::npos) ? misc.size() : value.len) - index;
19721 0 0 if (index != size_t(string::npos)) index++;
19727 8 28 for (size_t index = 0; index < misc.size(); )
19728 2 6 if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') {
0 2 if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') {
2 6 if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') {
19730 2 0 if (end_index == size_t(string::npos)) end_index = misc.size();
19733 0 2 if (index)
19736 2 0 misc.erase(index, end_index + (end_index < misc.size() ? 1 : 0) - index);
19739 6 0 if (index != size_t(string::npos)) index++;
19745 0 2 if (!misc.empty()) misc.push_back('|');
19751 0 0 for (unsigned i = 0; i < spaces.len; i++)
19773 0 0 for (unsigned i = 0; i < escaped_spaces.len; i++)
19774 0 0 if (escaped_spaces.str[i] != '\\' || i+1 >= escaped_spaces.len)
0 0 if (escaped_spaces.str[i] != '\\' || i+1 >= escaped_spaces.len)
19876 0 0 sa_lowercased(data_lowercased), sa_categorized(data_categorized) {}
0 0 sa_lowercased(data_lowercased), sa_categorized(data_categorized) {}
0 0 sa_lowercased(data_lowercased), sa_categorized(data_categorized) {}
19880 0 0 for (size_t i = 1, j = 0; i < s.words.size(); i++) {
19881 0 0 token* tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (token*)&s.multiword_tokens[j] : (token*)&s.words[i];
0 0 token* tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (token*)&s.multiword_tokens[j] : (token*)&s.words[i];
19883 0 0 if (previous_tok) {
19886 0 0 if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0;
0 0 if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0;
0 0 if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0;
19887 0 0 if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0;
0 0 if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0;
0 0 if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0;
19888 0 0 if (!score) score = difference(previous_tok->form, tok->form, false, LOWERCASE);
19889 0 0 if (!score) score = difference(previous_tok->form, tok->form, false, CATEGORIZE);
19890 0 0 if (!score) score = difference(previous_tok->form, tok->form, true, CATEGORIZE);
19892 0 0 if (score > 0)
19900 0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
19906 0 0 auto& func = mode == LOWERCASE ? perform_lowercase : perform_categorize;
19907 0 0 auto& sa = mode == LOWERCASE ? sa_lowercased : sa_categorized;
19910 0 0 string right_mapped = func(right);
19913 0 0 pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":"");
0 0 pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":"");
0 0 pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":"");
0 0 pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":"");
19914 0 0 int together = sa.count(pattern);
19916 0 0 pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":"");
0 0 pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":"");
0 0 pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":"");
0 0 pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":"");
0 0 pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":"");
19917 0 0 int apart = sa.count(pattern);
19926 0 0 for (auto&& chr : utf8::decoder(input))
19927 0 0 utf8::append(output, unicode::lowercase(chr));
19935 0 0 for (auto&& chr : utf8::decoder(input)) {
19937 0 0 if (category & unicode::C) output.push_back('C');
0 0 if (category & unicode::C) output.push_back('C');
19938 0 0 if (category & unicode::L) output.push_back('L');
0 0 if (category & unicode::L) output.push_back('L');
19939 0 0 if (category & unicode::M) output.push_back('M');
0 0 if (category & unicode::M) output.push_back('M');
19940 0 0 if (category & unicode::N) output.push_back('N');
0 0 if (category & unicode::N) output.push_back('N');
19941 0 0 if (category & unicode::Pc) output.push_back('c');
0 0 if (category & unicode::Pc) output.push_back('c');
19942 0 0 if (category & unicode::Pd) output.push_back('d');
0 0 if (category & unicode::Pd) output.push_back('d');
19943 0 0 if (category & unicode::Pe) output.push_back('e');
0 0 if (category & unicode::Pe) output.push_back('e');
19944 0 0 if (category & unicode::Pf) output.push_back('f');
0 0 if (category & unicode::Pf) output.push_back('f');
19945 0 0 if (category & unicode::Pi) output.push_back('i');
0 0 if (category & unicode::Pi) output.push_back('i');
19946 0 0 if (category & unicode::Po) output.push_back('o');
0 0 if (category & unicode::Po) output.push_back('o');
19947 0 0 if (category & unicode::Ps) output.push_back('s');
0 0 if (category & unicode::Ps) output.push_back('s');
19948 0 0 if (category & unicode::S) output.push_back('S');
0 0 if (category & unicode::S) output.push_back('S');
19949 0 0 if (category & unicode::Zl) output.push_back('Z');
0 0 if (category & unicode::Zl) output.push_back('Z');
19950 0 0 if (category & unicode::Zp) output.push_back('z');
0 0 if (category & unicode::Zp) output.push_back('z');
19951 0 0 if (category & unicode::Zs) output.push_back(' ');
0 0 if (category & unicode::Zs) output.push_back(' ');
19959 0 0 for (auto&& chr : utf8::decoder(word))
19960 0 0 if (unicode::category(chr) & unicode::L)
19968 0 0 for (auto&& chr : utf8::decoder(word))
19969 0 0 if (unicode::category(chr) & ~unicode::N)
19975 0 0 sa.reserve(str.size());
19976 0 0 for (unsigned i = 0; i < str.size(); i++)
19977 0 0 sa.push_back(i);
20003 1 0 : tokenizer(tokenizer), splitter(splitter), normalized_spaces(normalized_spaces), token_ranges(token_ranges) {}
20023 0 1 for (char32_t chr;
20024 1 0 text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len),
1 0 text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len),
20025 1 0 (unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t');
1 0 (unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t');
20032 34 1 for (following = text; following.len; unilib::utf8::decode(following.str, following.len))
20036 1 0 if (make_copy) {
20053 0 2 if (tokenizer->next_sentence(&forms, token_ranges ? &tokens : nullptr)) {
1 1 if (tokenizer->next_sentence(&forms, token_ranges ? &tokens : nullptr)) {
20056 7 1 for (size_t i = 0; i < forms.size(); i++) {
20057 7 0 while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' ||
7 0 while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' ||
7 0 while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' ||
7 0 while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' ||
7 0 while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' ||
20058 0 7 forms[i].str[0] == '\t' || forms[i].str[0] == ' '))
20060 7 0 while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' ||
7 0 while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' ||
7 0 while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' ||
7 0 while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' ||
0 7 while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' ||
20061 0 7 forms[i].str[forms[i].len-1] == '\t' || forms[i].str[forms[i].len-1] == ' '))
20063 0 7 if (!forms[i].len)
20066 1 0 if (!forms.size()) return next_sentence(s, error);
20068 7 1 for (size_t i = 0; i < forms.size(); i++) {
20072 34 7 for (size_t j = 0; j < forms[i].len; j++) {
20074 34 0 if (chr == '\r' || chr == '\n' || chr == '\t') chr = ' ';
0 34 if (chr == '\r' || chr == '\n' || chr == '\t') chr = ' ';
20075 0 34 if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ')
0 0 if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ')
0 0 if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ')
34 0 if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ')
20080 1 6 if (i == 0) {
20081 0 1 if (forms[0].str > text.str)
20085 7 0 if (!normalized_spaces) {
20086 1 6 tok.set_spaces_before(i == 0 ? saved_spaces : "");
7 0 tok.set_spaces_before(i == 0 ? saved_spaces : "");
20091 1 6 if (i+1 == forms.size()) {
20096 1 1 for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len),
0 1 for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len),
1 1 for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len),
20097 0 0 (unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); text = following)
0 0 (unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); text = following)
20102 0 7 if (normalized_spaces) {
20103 0 0 tok.set_space_after(i+1 == forms.size() ? !saved_spaces.empty() : forms[i+1].str > forms[i].str + forms[i].len);
20105 0 7 tok.set_spaces_in_token(tok.form.size() != forms[i].len ? forms[i] : "");
20106 1 6 tok.set_spaces_after(i+1 == forms.size() ? saved_spaces : string_piece(forms[i].str + forms[i].len, forms[i+1].str - forms[i].str - forms[i].len));
20111 0 7 if (token_ranges)
20114 7 0 if (splitter)
20121 1 0 if (new_document) {
20127 1 0 if (preceeding_newlines >= 2)
20131 1 0 s.set_sent_id(to_string(sentence_id++));
20135 7 1 for (size_t i = 1, j = 0; i < s.words.size(); i++) {
20136 0 7 const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form;
0 0 const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form;
20137 0 7 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 7 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
20141 6 1 if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
2 4 if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
3 4 if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
20148 0 1 if (text.len) {
20185 7 0 if (it == full_rules.end()) {
20186 0 7 if (version >= 2) {
20189 0 0 while (suffix.size() + 1 < buffer.size()) {
20193 0 0 if (suffix_it == suffix_rules.end())
20196 0 0 if (!suffix_it->second.words.empty()) {
20204 7 0 if (!prefix_len) {
20207 2 5 if (misc.len) s.words.back().misc.assign(misc.str, misc.len);
20215 0 0 if (unicode::category(utf8::first(token.str, token.len)) & unicode::Lut) {
20217 0 0 for (auto&& chr : utf8::decoder(token.str, token.len))
20218 0 0 if (unicode::category(chr) & (unicode::L & ~unicode::Lut)) { casing = UC_FIRST; break; }
20225 0 0 if (prefix_len) {
20228 0 0 while (s.words.back().form.size() < prefix_len && suffix.len)
0 0 while (s.words.back().form.size() < prefix_len && suffix.len)
0 0 while (s.words.back().form.size() < prefix_len && suffix.len)
20232 0 0 for (auto&& chr : utf8::decoder(it->second.words[0]))
20233 0 0 utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr);
0 0 utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr);
0 0 utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr);
20235 0 0 for (size_t i = 1; i < it->second.words.size(); i++)
20236 0 0 if (casing != UC_ALL) {
20246 1 0 if (!is.get(version)) return nullptr;
20247 1 0 if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr;
20250 1 0 if (!compressor::load(is, data)) return nullptr;
1 0 if (!compressor::load(is, data)) return nullptr;
20252 1 0 unique_ptr splitter(new multiword_splitter(version));
20254 1 0 for (unsigned full_rules = data.next_4B(); full_rules; full_rules--) {
0 1 for (unsigned full_rules = data.next_4B(); full_rules; full_rules--) {
20256 0 0 data.next_str(full_rule);
20261 0 0 for (unsigned words = data.next_1B(); words; words--) {
0 0 for (unsigned words = data.next_1B(); words; words--) {
20262 0 0 info.words.emplace_back();
20263 0 0 data.next_str(info.words.back());
20265 0 0 if (info.words.empty()) return nullptr;
20268 0 1 if (version >= 2)
20269 0 0 for (unsigned suffix_rules = data.next_4B(); suffix_rules; suffix_rules--) {
0 0 for (unsigned suffix_rules = data.next_4B(); suffix_rules; suffix_rules--) {
20271 0 0 data.next_str(suffix_rule);
20276 0 0 for (unsigned words = data.next_1B(); words; words--) {
0 0 for (unsigned words = data.next_1B(); words; words--) {
20277 0 0 info.words.emplace_back();
20278 0 0 data.next_str(info.words.back());
20280 0 0 if (info.words.empty()) return nullptr;
20283 0 0 if (!suffix_rule.empty())
20284 0 0 for (suffix_rule.pop_back(); !suffix_rule.empty(); suffix_rule.pop_back())
20286 0 0 }
20291 1 0 return data.is_end() ? splitter.release() : nullptr;
20339 0 0 for (auto&& sentence : data)
20340 0 0 for (auto&& multiword : sentence.multiword_tokens) {
20343 0 0 for (int i = multiword.id_first; i <= multiword.id_last; i++)
20344 0 0 utf8::map(unicode::lowercase, sentence.words[i].form, (lc_words.emplace_back(), lc_words.back()));
20346 0 0 auto& info = full_rules[lc_form];
20347 0 0 if (info.words.empty())
20350 0 0 if (!info.count) full_rules.erase(lc_form);
20354 0 0 for (auto&& sentence : data)
20355 0 0 for (size_t i = 1, j = 0; i < sentence.words.size(); i++) {
20356 0 0 if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) {
0 0 if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) {
0 0 if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) {
20363 0 0 if (it != full_rules.end())
20364 0 0 if (!--it->second.count)
20369 0 0 for (auto&& full_rule : full_rules) {
20371 0 0 while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++;
0 0 while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++;
0 0 while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++;
20372 0 0 for (; prefix_match; prefix_match--)
20373 0 0 if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) {
0 0 if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) {
0 0 if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) {
20374 0 0 lc_form.assign(full_rule.first, prefix_match, string::npos);
20376 0 0 lc_words[0].erase(0, prefix_match);
20378 0 0 auto& info = suffix_rules[lc_form];
20379 0 0 if (info.words.empty())
20382 0 0 if (!info.count) suffix_rules.erase(lc_form);
20387 0 0 for (auto&& sentence : data)
20388 0 0 for (size_t i = 1, j = 0; i < sentence.words.size(); i++) {
20389 0 0 if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) {
0 0 if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) {
0 0 if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) {
20395 0 0 while (lc_form.size() > 1) {
20396 0 0 lc_form.erase(0, 1);
20398 0 0 if (it != suffix_rules.end()) {
20399 0 0 if (it->second.count <= 10)
20408 0 0 binary_encoder enc;
20410 0 0 for (auto&& full_rule : full_rules) {
20411 0 0 enc.add_str(full_rule.first);
20412 0 0 enc.add_1B(full_rule.second.words.size());
20413 0 0 for (auto& word : full_rule.second.words)
20414 0 0 enc.add_str(word);
20417 0 0 for (auto&& suffix_rule : suffix_rules) {
20418 0 0 enc.add_str(suffix_rule.first);
20419 0 0 enc.add_1B(suffix_rule.second.words.size());
20420 0 0 for (auto& word : suffix_rule.second.words)
20421 0 0 enc.add_str(word);
20425 0 0 os.put(multiword_splitter::VERSION_LATEST);
20426 0 0 if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false;
0 0 if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false;
0 0 if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false;
20536 0 0 stringstream os_buffer;
20537 0 0 os_buffer.put(method.size());
20538 0 0 os_buffer.write(method.c_str(), method.size());
20541 0 0 if (method == "morphodita_parsito") {
20542 0 0 if (!trainer_morphodita_parsito::train(training, heldout, tokenizer, tagger, parser, os_buffer, error))
0 0 if (!trainer_morphodita_parsito::train(training, heldout, tokenizer, tagger, parser, os_buffer, error))
20545 0 0 error.assign("Unknown UDPipe method '").append(method).append("'!");
0 0 error.assign("Unknown UDPipe method '").append(method).append("'!");
20547 0 0 }
0 0 }
20553 0 0 os << os_buffer.rdbuf();
20571 0 0 for (unsigned i = 0; i < 10; i++)
0 0 for (unsigned i = 0; i < 10; i++)
0 0 for (unsigned i = 0; i < 10; i++)
0 0 for (unsigned i = 0; i < 10; i++)
0 0 for (unsigned i = 0; i < 10; i++)
0 0 for (unsigned i = 0; i < 10; i++)
0 0 for (unsigned i = 0; i < 10; i++)
0 0 for (unsigned i = 0; i < 10; i++)
0 0 for (unsigned i = 0; i < 10; i++)
0 0 for (unsigned i = 0; i < 10; i++)
20595 0 0 enc.add_1B(maps.size());
20596 0 0 for (auto&& map : maps)
20597 0 0 map.save(enc);
20599 0 0 return compressor::save(os, enc);
20622 0 0 for (auto&& description : ElementaryFeatures::descriptions)
20623 0 0 if (!elementary_map.emplace(description.name, description).second)
20624 0 0 training_failure("Repeated elementary feature with name " << description.name << '!');
20628 0 0 while (getline(is, line)) {
0 0 while (getline(is, line)) {
20629 0 0 split(line, ',', tokens);
20630 0 0 if (tokens.empty()) training_failure("Feature sequence cannot be empty!");
0 0 if (tokens.empty()) training_failure("Feature sequence cannot be empty!");
0 0 if (tokens.empty()) training_failure("Feature sequence cannot be empty!");
20633 0 0 sequences.emplace_back();
20634 0 0 for (auto&& token : tokens) {
20636 0 0 split(token, ' ', parts);
20637 0 0 if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!");
0 0 if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!");
0 0 if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!");
20639 0 0 if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!");
0 0 if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!");
0 0 if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!");
20642 0 0 int sequence_index = parse_int(parts[1].c_str(), "sequence_index");
20643 0 0 if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!");
0 0 if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!");
0 0 if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!");
0 0 if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!");
0 0 if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!");
20644 0 0 if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!");
0 0 if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!");
0 0 if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!");
0 0 if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!");
0 0 if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!");
0 0 if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!");
20645 0 0 if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!");
0 0 if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!");
0 0 if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!");
0 0 if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!");
0 0 if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!");
20647 0 0 sequences.back().elements.emplace_back(it->second.type, it->second.index, sequence_index);
20648 0 0 if (desc.type == DYNAMIC) sequences.back().dependant_range = max(sequences.back().dependant_range, window_size + 1);
20649 0 0 if (desc.type == PER_TAG) sequences.back().dependant_range = max(sequences.back().dependant_range, 1 - sequence_index);
20652 0 0 if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!");
0 0 if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!");
0 0 if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!");
0 0 if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!");
0 0 if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!");
20656 0 0 scores.resize(sequences.size());
20661 0 0 if (!elementary.save(os)) return false;
20664 0 0 enc.add_1B(sequences.size());
20665 0 0 for (auto&& sequence : sequences) {
20667 0 0 enc.add_1B(sequence.elements.size());
20668 0 0 for (auto&& element : sequence.elements) {
20675 0 0 enc.add_1B(scores.size());
20676 0 0 for (auto&& score : scores)
20677 0 0 score.save(enc);
20679 0 0 return compressor::save(os, enc);
20700 0 0 class training_elementary_feature_map {
0 0 class training_elementary_feature_map {
20734 0 0 return it != map.end() ? it->second.alpha : 0;
20777 0 0 for (unsigned i = 0; i < map_indices.size(); i++) {
20778 0 0 for (auto&& element : features.sequences[i].elements)
20779 0 0 for (auto&& description : decltype(features.elementary)::descriptions)
20780 0 0 if (element.type == description.type && element.elementary_index == description.index)
0 0 if (element.type == description.type && element.elementary_index == description.index)
20781 0 0 map_indices[i].emplace_back(description.map_index);
20783 0 0 assert(map_indices[i].size() == features.sequences[i].elements.size());
20787 0 0 vector> counts(elementary.maps.size());
20789 0 0 for (unsigned i = 0; i < features.sequences.size(); i++)
20790 0 0 for (auto&& element : features.scores[i].map)
20791 0 0 if (element.second.gamma) {
20793 0 0 for (const char* key = element.first.c_str(); key != element.first.c_str() + element.first.size(); assert(key <= element.first.c_str() + element.first.size()))
0 0 for (const char* key = element.first.c_str(); key != element.first.c_str() + element.first.size(); assert(key <= element.first.c_str() + element.first.size()))
20794 0 0 elementary_ids.emplace_back(vli::decode(key));
20796 0 0 assert(elementary_ids.size() == features.sequences[i].elements.size());
20797 0 0 for (unsigned j = 0; j < elementary_ids.size(); j++) {
20798 0 0 if (map_indices[i][j] < 0) continue;
20799 0 0 if (elementary_ids[j] >= counts[map_indices[i][j]].size()) counts[map_indices[i][j]].resize(elementary_ids[j] + 1);
0 0 if (elementary_ids[j] >= counts[map_indices[i][j]].size()) counts[map_indices[i][j]].resize(elementary_ids[j] + 1);
20805 0 0 for (auto&& count : counts) {
20806 0 0 if (elementary_feature_empty >= count.size()) count.resize(elementary_feature_empty + 1);
0 0 if (elementary_feature_empty >= count.size()) count.resize(elementary_feature_empty + 1);
20809 0 0 for (elementary_feature_value i = 0; i < count.size(); i++) count[i].ori = i;
20814 0 0 vector> elementary_ids_map(counts.size());
20815 0 0 for (unsigned i = 0; i < counts.size(); i++) {
20816 0 0 elementary_ids_map[i].resize(counts[i].size());
20817 0 0 for (elementary_feature_value j = 0; j < counts[i].size(); j++)
20818 0 0 elementary_ids_map[i][counts[i][j].ori] = counts[i][j].count ? j : elementary_feature_unknown;
20823 0 0 for (unsigned i = 0; i < elementary.maps.size(); i++) {
20825 0 0 for (auto&& element : elementary.maps[i].map)
20826 0 0 if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown)
0 0 if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown)
0 0 if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown)
20829 0 0 optimized_elementary.maps.emplace_back(persistent_unordered_map(mapped_ids, 1, [](binary_encoder& enc, int id) {
0 0 optimized_elementary.maps.emplace_back(persistent_unordered_map(mapped_ids, 1, [](binary_encoder& enc, int id) {
20835 0 0 optimized_features.sequences = features.sequences;
20838 0 0 for (unsigned i = 0; i < features.sequences.size(); i++) {
20840 0 0 for (auto&& element : features.scores[i].map)
20841 0 0 if (element.second.gamma) {
20843 0 0 for (const char* key = element.first.c_str(); key < element.first.c_str() + element.first.size(); )
20844 0 0 elementary_ids.emplace_back(vli::decode(key));
20846 0 0 assert(elementary_ids.size() == features.sequences[i].elements.size());
20847 0 0 for (unsigned j = 0; j < elementary_ids.size(); j++) {
20848 0 0 if (map_indices[i][j] < 0) continue;
20849 0 0 assert(elementary_ids[j] < elementary_ids_map[map_indices[i][j]].size() && elementary_ids_map[map_indices[i][j]][elementary_ids[j]] != elementary_feature_unknown);
0 0 assert(elementary_ids[j] < elementary_ids_map[map_indices[i][j]].size() && elementary_ids_map[map_indices[i][j]][elementary_ids[j]] != elementary_feature_unknown);
20853 0 0 key_buffer.resize(elementary_ids.size() * vli::max_length());
20855 0 0 for (unsigned j = 0; j < elementary_ids.size(); j++)
20861 0 0 optimized_features.scores.emplace_back(persistent_unordered_map(updated_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) {
0 0 optimized_features.scores.emplace_back(persistent_unordered_map(updated_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) {
20862 0 0 assert(feature_sequence_score(info.gamma) == info.gamma);
20928 0 0 if (!d) training_failure("Cannot load dictionary!");
0 0 if (!d) training_failure("Cannot load dictionary!");
0 0 if (!d) training_failure("Cannot load dictionary!");
20930 0 0 if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!");
0 0 if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!");
0 0 if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!");
0 0 if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!");
20935 0 0 load_data(in_train, *d, use_guesser, train_data, true);
20938 0 0 if (in_heldout) {
20941 0 0 load_data(in_heldout, *d, use_guesser, heldout_data, false);
20946 0 0 out_tagger << in_morpho_dict.rdbuf();
20947 0 0 out_tagger.put(use_guesser);
20950 0 0 TaggerTrainer::train(decoding_order, window_size, iterations, train_data, heldout_data, early_stopping, prune_features, in_feature_templates, out_tagger);
20961 0 0 sentences.emplace_back();
20962 0 0 while (getline(is, line)) {
0 0 while (getline(is, line)) {
20963 0 0 if (line.empty()) {
20964 0 0 if (!sentences.back().words.empty())
20965 0 0 sentences.emplace_back();
20969 0 0 split(line, '\t', tokens);
20970 0 0 if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!");
0 0 if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!");
0 0 if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!");
20975 0 0 s.words.emplace_back(tokens[0]);
20976 0 0 s.gold.emplace_back(tokens[1], tokens[2]);
20977 0 0 s.gold_index.emplace_back(-1);
20980 0 0 s.analyses.emplace_back();
20981 0 0 d.analyze(tokens[0], use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, s.analyses.back());
0 0 d.analyze(tokens[0], use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, s.analyses.back());
20984 0 0 for (size_t i = 0; i < s.analyses.back().size(); i++)
20985 0 0 if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) {
0 0 if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) {
0 0 if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) {
20990 0 0 if (s.gold_index.back() == -1 && add_gold) {
0 0 if (s.gold_index.back() == -1 && add_gold) {
0 0 if (s.gold_index.back() == -1 && add_gold) {
20992 0 0 s.analyses.back().emplace_back(tokens[1], tokens[2]);
20995 0 0 if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back();
0 0 if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back();
0 0 if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back();
20998 0 0 for (auto&& sentence : sentences)
20999 0 0 for (auto&& word : sentence.words)
21000 0 0 sentence.forms.emplace_back(string_piece(word.c_str(), d.raw_form_len(word)));
0 0 sentence.forms.emplace_back(string_piece(word.c_str(), d.raw_form_len(word)));
21040 0 0 features.parse(window_size, in_feature_templates);
21043 0 0 train_viterbi(decoding_order, window_size, iterations, train, heldout, early_stopping, prune_features, features);
21048 0 0 optimizer::optimize(features, optimized_features);
21049 0 0 if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!");
0 0 if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!");
0 0 if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!");
0 0 if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!");
21058 0 0 typename decltype(decoder)::cache decoder_cache(decoder);
21060 0 0 typename FeatureSequences::cache feature_sequences_cache(features);
21064 0 0 vector window(window_size);
21067 0 0 if (prune_features)
21068 0 0 for (unsigned s = 0; s < train.size(); s++) {
21070 0 0 features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache);
21071 0 0 for (int i = 0; i < int(sentence.forms.size()); i++) {
21073 0 0 for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j];
0 0 for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j];
21076 0 0 features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache);
21078 0 0 for (unsigned f = 0; f < features.scores.size(); f++)
21079 0 0 if (!gold_feature_sequences_keys[f].empty())
21085 0 0 for (int i = 0; i < iterations; i++) {
21088 0 0 cerr << "Iteration " << i + 1 << ": ";
0 0 cerr << "Iteration " << i + 1 << ": ";
21091 0 0 for (unsigned s = 0; s < train.size(); s++) {
21095 0 0 if (tags.size() < sentence.forms.size()) tags.resize(2 * sentence.forms.size());
0 0 if (tags.size() < sentence.forms.size()) tags.resize(2 * sentence.forms.size());
21096 0 0 decoder.tag(sentence.forms, sentence.analyses, decoder_cache, tags);
21099 0 0 features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache);
21100 0 0 for (int i = 0; i < int(sentence.forms.size()); i++) {
21105 0 0 for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = tags[i - j];
0 0 for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = tags[i - j];
21107 0 0 features.feature_keys(i, window.data(), 0, decoded_dynamic_features, decoded_feature_sequences_keys, feature_sequences_cache);
21109 0 0 for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j];
0 0 for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j];
21111 0 0 features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache);
21113 0 0 for (unsigned f = 0; f < features.scores.size(); f++) {
21114 0 0 if (decoded_feature_sequences_keys[f] != gold_feature_sequences_keys[f]) {
21115 0 0 if (!decoded_feature_sequences_keys[f].empty()) {
21117 0 0 if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first;
0 0 if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first;
0 0 if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first;
21118 0 0 if (it != features.scores[f].map.end()) {
21126 0 0 if (!gold_feature_sequences_keys[f].empty()) {
21128 0 0 if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first;
0 0 if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first;
0 0 if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first;
21129 0 0 if (it != features.scores[f].map.end()) {
21142 0 0 for (auto&& score : features.scores)
21143 0 0 for (auto&& element : score.map) {
21150 0 0 if (!heldout.empty()) {
21156 0 0 optimizer::optimize(features, frozen_features);
21158 0 0 typename decltype(frozen_decoder)::cache frozen_decoder_cache(frozen_decoder);
21160 0 0 for (auto&& sentence : heldout) {
21161 0 0 if (tags.size() < sentence.forms.size()) tags.resize(sentence.forms.size() * 2);
0 0 if (tags.size() < sentence.forms.size()) tags.resize(sentence.forms.size() * 2);
21162 0 0 frozen_decoder.tag(sentence.forms, sentence.analyses, frozen_decoder_cache, tags);
21164 0 0 for (unsigned i = 0; i < sentence.forms.size(); i++) {
21167 0 0 heldout_correct[BOTH] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag && sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma;
0 0 heldout_correct[BOTH] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag && sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma;
21172 0 0 if (early_stopping && heldout_correct[BOTH] > best_correct) {
0 0 if (early_stopping && heldout_correct[BOTH] > best_correct) {
21175 0 0 best_features = features;
21178 0 0 cerr << ", heldout accuracy " << fixed << setprecision(2)
21186 0 0 if (early_stopping && best_iteration >= 0) {
21187 0 0 cerr << "Chosen tagger model from iteration " << best_iteration + 1 << endl;
21188 0 0 features = best_features;
21286 0 0 for (auto&& sentence : training)
21287 0 0 for (size_t i = 1; i < sentence.words.size(); i++)
21288 0 0 if (!can_combine_tag(sentence.words[i], error))
21290 0 0 for (auto&& sentence : heldout)
21291 0 0 for (size_t i = 1; i < sentence.words.size(); i++)
21292 0 0 if (!can_combine_tag(sentence.words[i], error))
21295 0 0 if (!train_tokenizer(training, heldout, tokenizer, os, error)) return false;
21298 0 0 ostringstream os_tagger;
21299 0 0 if (!train_tagger(training, heldout, tagger, os_tagger, error)) return false;
0 0 if (!train_tagger(training, heldout, tagger, os_tagger, error)) return false;
21301 0 0 os.write(tagger_model.data(), tagger_model.size());
21303 0 0 if (!train_parser(training, heldout, parser, tagger_model, os, error)) return false;
0 0 if (!train_parser(training, heldout, parser, tagger_model, os, error)) return false;
21310 0 0 if (options == NONE) {
21315 0 0 if (!named_values::parse(options, tokenizer, error)) return false;
0 0 if (!named_values::parse(options, tokenizer, error)) return false;
21316 0 0 int run = 0; if (!option_int(tokenizer, "run", run, error)) return false;
0 0 int run = 0; if (!option_int(tokenizer, "run", run, error)) return false;
0 0 int run = 0; if (!option_int(tokenizer, "run", run, error)) return false;
21318 0 0 if (tokenizer.count("from_model")) {
0 0 if (tokenizer.count("from_model")) {
21321 0 0 if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data))
0 0 if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data))
0 0 if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data))
21322 0 0 return error.assign("Cannot load model from which the tokenizer should be used!"), false;
21325 0 0 os.write(tokenizer_data.str, tokenizer_data.len);
21327 0 0 os.put(1);
21328 0 0 const string& model = option_str(tokenizer, "model");
0 0 const string& model = option_str(tokenizer, "model");
21331 0 0 if (model == "generic") {
21332 0 0 os.put(morphodita::tokenizer_id::GENERIC);
21334 0 0 } else if (model.empty() || model == "gru") {
0 0 } else if (model.empty() || model == "gru") {
0 0 } else if (model.empty() || model == "gru") {
21337 0 0 if (tokenizer.count("detokenize")) {
0 0 if (tokenizer.count("detokenize")) {
21338 0 0 detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"]));
0 0 detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"]));
0 0 detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"]));
21339 0 0 if (!detokenizer) return error.assign("Cannot create detokenizer!"), false;
0 0 if (!detokenizer) return error.assign("Cannot create detokenizer!"), false;
21345 0 0 for (size_t training_sentence = 0; training_sentence < training.size(); training_sentence++) {
21346 0 0 sentence s = training[training_sentence];
21347 0 0 if (detokenizer) detokenizer->detokenize(s);
0 0 if (detokenizer) detokenizer->detokenize(s);
21349 0 0 auto& sentence = (sentences.emplace_back(), sentences.back());
21351 0 0 for (size_t i = 1, j = 0; i < s.words.size(); i++) {
21352 0 0 const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ?
21353 0 0 (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
21355 0 0 sentence.tokens.emplace_back(sentence.sentence.size(), 0);
21356 0 0 for (auto&& chr : unilib::utf8::decoder(tok.form)) {
21357 0 0 sentence.sentence.push_back(chr);
21358 0 0 if (unilib::unicode::category(chr) & unilib::unicode::Zs) spaces_in_training = true;
21362 0 0 if (tok.get_space_after()) sentence.sentence.push_back(' ');
0 0 if (tok.get_space_after()) sentence.sentence.push_back(' ');
0 0 if (tok.get_space_after()) sentence.sentence.push_back(' ');
21364 0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
21367 0 0 if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par()))
0 0 if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par()))
0 0 if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par()))
0 0 if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par()))
0 0 if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par()))
0 0 if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par()))
21374 0 0 bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false;
0 0 bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false;
0 0 bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false;
21375 0 0 for (size_t heldout_sentence = 0; heldout_sentence < heldout.size(); heldout_sentence++) {
21376 0 0 sentence s = heldout[heldout_sentence];
21377 0 0 if (detokenizer && detokenize_handout) detokenizer->detokenize(s);
0 0 if (detokenizer && detokenize_handout) detokenizer->detokenize(s);
0 0 if (detokenizer && detokenize_handout) detokenizer->detokenize(s);
0 0 if (detokenizer && detokenize_handout) detokenizer->detokenize(s);
21379 0 0 auto& sentence = (heldout_sentences.emplace_back(), heldout_sentences.back());
21381 0 0 for (size_t i = 1, j = 0; i < s.words.size(); i++) {
21382 0 0 const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ?
21383 0 0 (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
21385 0 0 sentence.tokens.emplace_back(sentence.sentence.size(), 0);
21386 0 0 for (auto&& chr : unilib::utf8::decoder(tok.form))
21387 0 0 sentence.sentence.push_back(chr);
21390 0 0 if (tok.get_space_after()) sentence.sentence.push_back(' ');
0 0 if (tok.get_space_after()) sentence.sentence.push_back(' ');
0 0 if (tok.get_space_after()) sentence.sentence.push_back(' ');
21392 0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
0 0 if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
21395 0 0 if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par()))
0 0 if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par()))
0 0 if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par()))
0 0 if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par()))
0 0 if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par()))
0 0 if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par()))
21400 0 0 bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false;
0 0 bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false;
0 0 bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false;
21401 0 0 int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false;
0 0 int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false;
0 0 int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false;
21402 0 0 bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false;
0 0 bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false;
0 0 bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false;
21403 0 0 int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false;
0 0 int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false;
0 0 int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false;
21404 0 0 int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false;
0 0 int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false;
0 0 int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false;
21405 0 0 int batch_size = run <= 1 ? 50 : 50 + 50 * hyperparameter_integer(run, 1, 0, 1);
21406 0 0 if (!option_int(tokenizer, "batch_size", batch_size, error)) return false;
0 0 if (!option_int(tokenizer, "batch_size", batch_size, error)) return false;
0 0 if (!option_int(tokenizer, "batch_size", batch_size, error)) return false;
21407 0 0 double learning_rate = run <= 1 ? 0.005 : hyperparameter_logarithmic(run, 2, 0.0005, 0.01);
21408 0 0 if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false;
0 0 if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false;
0 0 if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false;
21409 0 0 double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false;
0 0 double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false;
0 0 double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false;
21410 0 0 double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false;
0 0 double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false;
0 0 double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false;
21411 0 0 double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false;
0 0 double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false;
0 0 double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false;
21412 0 0 bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false;
0 0 bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false;
0 0 bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false;
21414 0 0 if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size
0 0 if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size
0 0 if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size
21417 0 0 cerr << "Training tokenizer with the following options: " << "tokenize_url=" << (tokenize_url ? 1 : 0)
0 0 cerr << "Training tokenizer with the following options: " << "tokenize_url=" << (tokenize_url ? 1 : 0)
21418 0 0 << ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl
0 0 << ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl
0 0 << ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl
21419 0 0 << " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size
0 0 << " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size
0 0 << " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size
21421 0 0 << " dropout=" << dropout << ", early_stopping=" << (early_stopping ? 1 : 0) << endl;
0 0 << " dropout=" << dropout << ", early_stopping=" << (early_stopping ? 1 : 0) << endl;
21424 0 0 os.put(morphodita::tokenizer_ids::GRU);
21425 0 0 if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0,
0 0 if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0,
0 0 if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0,
21431 0 0 return error.assign("Unknown tokenizer model '").append(model).append("'!"), false;
0 0 return error.assign("Unknown tokenizer model '").append(model).append("'!"), false;
21435 0 0 if (!multiword_splitter_trainer::train(training, os, error)) return false;
0 0 if (!multiword_splitter_trainer::train(training, os, error)) return false;
21444 0 0 if (options == NONE) {
21449 0 0 if (!named_values::parse(options, tagger, error)) return false;
0 0 if (!named_values::parse(options, tagger, error)) return false;
21451 0 0 if (tagger.count("from_model")) {
0 0 if (tagger.count("from_model")) {
21454 0 0 string model_name = "from_model";
21456 0 0 do {
21457 0 0 taggers_data.emplace_back();
21458 0 0 if (!load_model(tagger[model_name], TAGGER_MODEL, taggers_data.back()))
0 0 if (!load_model(tagger[model_name], TAGGER_MODEL, taggers_data.back()))
21459 0 0 return error.assign("Cannot load model from which the tagger should be used!"), false;
21460 0 0 if (taggers_data.back().str[0]) {
21463 0 0 vector overrides = {"lemma", "xpostag", "feats"};
0 0 vector overrides = {"lemma", "xpostag", "feats"};
0 0 vector overrides = {"lemma", "xpostag", "feats"};
0 0 vector overrides = {"lemma", "xpostag", "feats"};
0 0 vector overrides = {"lemma", "xpostag", "feats"};
0 0 vector overrides = {"lemma", "xpostag", "feats"};
21464 0 0 for (size_t i = 0; i < overrides.size(); i++) {
21465 0 0 string override_name = "from_model_" + overrides[i];
21467 0 0 if (!option_int(tagger, override_name, override_value, error, model_index)) return false;
0 0 if (!option_int(tagger, override_name, override_value, error, model_index)) return false;
21468 0 0 if (override_value >= 0)
21474 0 0 model_name = "from_model_" + to_string(1 + ++model_index);
21476 0 0 if (taggers_total < 0 || taggers_total > 4) return error.assign("Cannot create more than four tagger models!"), false;
0 0 if (taggers_total < 0 || taggers_total > 4) return error.assign("Cannot create more than four tagger models!"), false;
21479 0 0 os.put(taggers_total);
21480 0 0 for (auto&& tagger_data : taggers_data)
21481 0 0 os.write(tagger_data.str + 1, tagger_data.len - 1);
21484 0 0 int models = 1; if (!option_int(tagger, "models", models, error)) return false;
0 0 int models = 1; if (!option_int(tagger, "models", models, error)) return false;
0 0 int models = 1; if (!option_int(tagger, "models", models, error)) return false;
21485 0 0 if (models <= 0) return error.assign("Number of tagger models cannot be negative or zero!"), false;
0 0 if (models <= 0) return error.assign("Number of tagger models cannot be negative or zero!"), false;
21486 0 0 if (models > 4) return error.assign("Cannot create more than four tagger models!"), false;
0 0 if (models > 4) return error.assign("Cannot create more than four tagger models!"), false;
21488 0 0 os.put(models);
21489 0 0 for (int model = 0; model < models; model++)
21490 0 0 if (!train_tagger_model(training, heldout, model, models, tagger, os, error))
0 0 if (!train_tagger_model(training, heldout, model, models, tagger, os, error))
21500 0 0 if (options == NONE) {
21505 0 0 if (!named_values::parse(options, parser, error)) return false;
0 0 if (!named_values::parse(options, parser, error)) return false;
21506 0 0 int run = 0; if (!option_int(parser, "run", run, error)) return false;
0 0 int run = 0; if (!option_int(parser, "run", run, error)) return false;
0 0 int run = 0; if (!option_int(parser, "run", run, error)) return false;
21508 0 0 if (parser.count("from_model")) {
0 0 if (parser.count("from_model")) {
21511 0 0 if (!load_model(parser["from_model"], PARSER_MODEL, parser_data))
0 0 if (!load_model(parser["from_model"], PARSER_MODEL, parser_data))
0 0 if (!load_model(parser["from_model"], PARSER_MODEL, parser_data))
21512 0 0 return error.assign("Cannot load model from which the parser should be used!"), false;
21515 0 0 os.write(parser_data.str, parser_data.len);
21517 0 0 os.put(1);
21520 0 0 string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective";
0 0 string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective";
0 0 string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective";
0 0 string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective";
21521 0 0 string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] :
0 0 string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] :
0 0 string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] :
0 0 string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] :
21524 0 0 "static";
0 0 "static";
0 0 "static";
0 0 "static";
21526 0 0 int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false;
0 0 int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false;
0 0 int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false;
21527 0 0 int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false;
0 0 int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false;
0 0 int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false;
21528 0 0 int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false;
0 0 int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false;
0 0 int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false;
21529 0 0 int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false;
0 0 int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false;
0 0 int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false;
21530 0 0 int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false;
0 0 int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false;
0 0 int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false;
21531 0 0 int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false;
0 0 int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false;
0 0 int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false;
21532 0 0 int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false;
0 0 int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false;
0 0 int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false;
21533 0 0 int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false;
0 0 int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false;
0 0 int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false;
21535 0 0 if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n");
0 0 if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n");
0 0 if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n");
21536 0 0 if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n");
0 0 if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n");
0 0 if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n");
21537 0 0 if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n");
0 0 if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n");
0 0 if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n");
21538 0 0 if (embedding_form) {
21539 0 0 embeddings.append("form ").append(to_string(embedding_form)).append(" ").append(to_string(embedding_form_mincount));
0 0 embeddings.append("form ").append(to_string(embedding_form)).append(" ").append(to_string(embedding_form_mincount));
21540 0 0 if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file"));
0 0 if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file"));
0 0 if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file"));
0 0 if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file"));
0 0 if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file"));
0 0 if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file"));
21541 0 0 embeddings.push_back('\n');
21543 0 0 if (embedding_lemma) {
21544 0 0 embeddings.append("lemma ").append(to_string(embedding_lemma)).append(" ").append(to_string(embedding_lemma_mincount));
0 0 embeddings.append("lemma ").append(to_string(embedding_lemma)).append(" ").append(to_string(embedding_lemma_mincount));
21545 0 0 if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file"));
0 0 if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file"));
0 0 if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file"));
0 0 if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file"));
0 0 if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file"));
0 0 if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file"));
21546 0 0 embeddings.push_back('\n');
21548 0 0 if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n");
0 0 if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n");
0 0 if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n");
21550 0 0 bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false;
0 0 bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false;
0 0 bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false;
21551 0 0 int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false;
0 0 int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false;
0 0 int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false;
21552 0 0 int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false;
0 0 int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false;
0 0 int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false;
21553 0 0 int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false;
0 0 int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false;
0 0 int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false;
21554 0 0 int structured_interval = run <= 1 ? 8 : hyperparameter_integer(run,1,0,2) == 2 ? 0 : 8 + 2*hyperparameter_integer(run,1,0,2);
0 0 int structured_interval = run <= 1 ? 8 : hyperparameter_integer(run,1,0,2) == 2 ? 0 : 8 + 2*hyperparameter_integer(run,1,0,2);
21555 0 0 if (!option_int(parser, "structured_interval", structured_interval, error)) return false;
0 0 if (!option_int(parser, "structured_interval", structured_interval, error)) return false;
0 0 if (!option_int(parser, "structured_interval", structured_interval, error)) return false;
21556 0 0 double learning_rate = run <= 1 ? 0.02 : hyperparameter_logarithmic(run, 2, 0.005, 0.04);
21557 0 0 if (!option_double(parser, "learning_rate", learning_rate, error)) return false;
0 0 if (!option_double(parser, "learning_rate", learning_rate, error)) return false;
0 0 if (!option_double(parser, "learning_rate", learning_rate, error)) return false;
21558 0 0 double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false;
0 0 double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false;
0 0 double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false;
21559 0 0 double l2 = run <= 1 ? 0.5 : hyperparameter_uniform(run, 3, 0.2, 0.6);
21560 0 0 if (!option_double(parser, "l2", l2, error)) return false;
0 0 if (!option_double(parser, "l2", l2, error)) return false;
0 0 if (!option_double(parser, "l2", l2, error)) return false;
21561 0 0 bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false;
0 0 bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false;
0 0 bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false;
21563 0 0 if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval
0 0 if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval
0 0 if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval
21589 0 0 bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false;
0 0 bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false;
0 0 bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false;
21590 0 0 if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) {
0 0 if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) {
0 0 if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) {
0 0 if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) {
21591 0 0 stringstream tagger_description;
21592 0 0 tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0);
0 0 tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0);
0 0 tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0);
0 0 tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0);
0 0 tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0);
0 0 tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0);
21593 0 0 tagger.reset(model_morphodita_parsito::load(tagger_description));
21594 0 0 if (!tagger) return error.assign("Cannot load the tagger model for parser training data generation!"), false;
0 0 if (!tagger) return error.assign("Cannot load the tagger model for parser training data generation!"), false;
21598 0 0 sentence tagged;
21600 0 0 for (auto&& sentence : training) {
21601 0 0 tagged = sentence;
21602 0 0 if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false;
0 0 if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false;
0 0 if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false;
0 0 if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false;
21604 0 0 train_trees.emplace_back();
21605 0 0 for (size_t i = 1; i < tagged.words.size(); i++) {
21607 0 0 model_normalize_form(tagged.words[i].form, train_trees.back().nodes.back().form);
21613 0 0 for (size_t i = 1; i < tagged.words.size(); i++)
21614 0 0 train_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel);
21619 0 0 for (auto&& sentence : heldout) {
21620 0 0 tagged = sentence;
21621 0 0 if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false;
0 0 if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false;
0 0 if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false;
0 0 if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false;
21623 0 0 heldout_trees.emplace_back();
21624 0 0 for (size_t i = 1; i < tagged.words.size(); i++) {
21626 0 0 model_normalize_form(tagged.words[i].form, heldout_trees.back().nodes.back().form);
21632 0 0 for (size_t i = 1; i < tagged.words.size(); i++)
21633 0 0 heldout_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel);
21637 0 0 << ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl
0 0 << ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl
0 0 << ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl
21638 0 0 << "Parser uses lemmas/upos/xpos/feats: " << (tagger ? "automatically generated by tagger" : "from gold data") << endl
0 0 << "Parser uses lemmas/upos/xpos/feats: " << (tagger ? "automatically generated by tagger" : "from gold data") << endl
21639 0 0 << "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag
0 0 << "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag
0 0 << "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag
21640 0 0 << ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl
0 0 << ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl
0 0 << ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl
21641 0 0 << " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl
0 0 << " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl
0 0 << " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl
0 0 << " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl
0 0 << " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl
0 0 << " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl
0 0 << " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl
21642 0 0 << " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl
0 0 << " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl
0 0 << " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl
0 0 << " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl
0 0 << " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl
0 0 << " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl
0 0 << " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl
21643 0 0 << "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl
0 0 << "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl
0 0 << "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl
21645 0 0 << ", l2=" << l2 << ", early_stopping=" << (early_stopping ? 1 : 0) << endl;
0 0 << ", l2=" << l2 << ", early_stopping=" << (early_stopping ? 1 : 0) << endl;
21648 0 0 binary_encoder enc;
21649 0 0 enc.add_str("nn_versioned");
21651 0 0 parameters, 1, train_trees, heldout_trees, enc);
21652 0 0 compressor::save(os, enc);
21664 0 0 if (!is.get(len)) return false;
0 0 if (!is.get(len)) return false;
21666 0 0 if (!is.read(&name[0], len)) return false;
0 0 if (!is.read(&name[0], len)) return false;
21667 0 0 if (name != "morphodita_parsito") return false;
21670 0 0 if (!is.get(version)) return false;
0 0 if (!is.get(version)) return false;
21671 0 0 if (!(version >= 1 && version <= model_morphodita_parsito::VERSION_LATEST)) return false;
21676 0 0 if (version >= 2) {
21678 0 0 if (!is.get(sentinel) || sentinel != 0x7F) return false;
0 0 if (!is.get(sentinel) || sentinel != 0x7F) return false;
0 0 if (!is.get(sentinel) || sentinel != 0x7F) return false;
0 0 if (!is.get(sentinel) || sentinel != 0x7F) return false;
21679 0 0 if (!is.get(sentinel) || sentinel != 0x7F) return false;
0 0 if (!is.get(sentinel) || sentinel != 0x7F) return false;
0 0 if (!is.get(sentinel) || sentinel != 0x7F) return false;
0 0 if (!is.get(sentinel) || sentinel != 0x7F) return false;
21684 0 0 if (model == TOKENIZER_MODEL) range.str = data.data() + is.tellg();
0 0 if (model == TOKENIZER_MODEL) range.str = data.data() + is.tellg();
21685 0 0 char tokenizer; if (!is.get(tokenizer)) return false;
0 0 char tokenizer; if (!is.get(tokenizer)) return false;
21686 0 0 unique_ptr tokenizer_factory(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr);
0 0 unique_ptr tokenizer_factory(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr);
21687 0 0 if (tokenizer && !tokenizer_factory) return false;
0 0 if (tokenizer && !tokenizer_factory) return false;
0 0 if (tokenizer && !tokenizer_factory) return false;
21688 0 0 unique_ptr splitter(tokenizer ? multiword_splitter::load(is) : nullptr);
0 0 unique_ptr splitter(tokenizer ? multiword_splitter::load(is) : nullptr);
21689 0 0 if (model == TOKENIZER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true;
0 0 if (model == TOKENIZER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true;
21694 0 0 if (model == TAGGER_MODEL) range.str = data.data() + is.tellg();
0 0 if (model == TAGGER_MODEL) range.str = data.data() + is.tellg();
21695 0 0 char taggers; if (!is.get(taggers)) return false;
0 0 char taggers; if (!is.get(taggers)) return false;
21696 0 0 for (char i = 0; i < taggers; i++) {
21697 0 0 char lemma; if (!is.get(lemma)) return false;
0 0 char lemma; if (!is.get(lemma)) return false;
21698 0 0 char xpostag; if (!is.get(xpostag)) return false;
0 0 char xpostag; if (!is.get(xpostag)) return false;
21699 0 0 char feats; if (!is.get(feats)) return false;
0 0 char feats; if (!is.get(feats)) return false;
21700 0 0 unique_ptr tagger(morphodita::tagger::load(is));
21701 0 0 if (!tagger) return false;
21703 0 0 if (model == TAGGER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true;
0 0 if (model == TAGGER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true;
21708 0 0 if (model == PARSER_MODEL) range.str = data.data() + is.tellg();
0 0 if (model == PARSER_MODEL) range.str = data.data() + is.tellg();
21710 0 0 if (!is.get(parser)) return false;
0 0 if (!is.get(parser)) return false;
21711 0 0 unique_ptr parser_model(parser ? parsito::parser::load(is) : nullptr);
0 0 unique_ptr parser_model(parser ? parsito::parser::load(is) : nullptr);
21712 0 0 if (parser && !parser_model) return false;
0 0 if (parser && !parser_model) return false;
0 0 if (parser && !parser_model) return false;
21713 0 0 if (model == PARSER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true;
0 0 if (model == PARSER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true;
21720 0 0 return model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).normalize_form(form, output);
21724 0 0 return model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).normalize_lemma(lemma, output);
21728 0 0 model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).fill_word_analysis(analysis, false, upostag, lemma, xpostag, feats, word);
21736 0 0 unique_ptr conllu_input_format(input_format::new_conllu_input_format());
21738 0 0 int run = 0; if (!option_int(tagger, "run", run, error, model)) return false;
0 0 int run = 0; if (!option_int(tagger, "run", run, error, model)) return false;
0 0 int run = 0; if (!option_int(tagger, "run", run, error, model)) return false;
21741 0 0 for (auto&& sentence : training)
21742 0 0 for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++)
0 0 for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++)
0 0 for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++)
21743 0 0 if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_")
0 0 if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_")
0 0 if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_")
21745 0 0 bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false;
0 0 bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false;
0 0 bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false;
21746 0 0 int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false;
0 0 int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false;
0 0 int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false;
21747 0 0 int use_lemma = have_lemma && use_lemma_flag ? lemma_encoding : 0;
0 0 int use_lemma = have_lemma && use_lemma_flag ? lemma_encoding : 0;
21748 0 0 bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false;
0 0 bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false;
0 0 bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false;
21749 0 0 bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false;
0 0 bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false;
0 0 bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false;
21751 0 0 bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false;
0 0 bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false;
0 0 bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false;
21752 0 0 bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false;
0 0 bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false;
0 0 bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false;
21753 0 0 bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false;
0 0 bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false;
0 0 bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false;
21754 0 0 os.put(char(provide_lemma ? use_lemma : 0));
0 0 os.put(char(provide_lemma ? use_lemma : 0));
21755 0 0 os.put(char(provide_xpostag && use_xpostag));
0 0 os.put(char(provide_xpostag && use_xpostag));
0 0 os.put(char(provide_xpostag && use_xpostag));
21756 0 0 os.put(char(provide_feats && use_feats));
0 0 os.put(char(provide_feats && use_feats));
0 0 os.put(char(provide_feats && use_feats));
21758 0 0 cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0)
0 0 cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0)
0 0 cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0)
0 0 cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0)
21759 0 0 << ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0)
0 0 << ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0)
0 0 << ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0)
0 0 << ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0)
21760 0 0 << ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl;
0 0 << ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl;
0 0 << ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl;
0 0 << ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl;
21763 0 0 stringstream morpho_description;
21767 0 0 const string& dictionary_model = option_str(tagger, "dictionary_model", model);
0 0 const string& dictionary_model = option_str(tagger, "dictionary_model", model);
21768 0 0 if (!dictionary_model.empty()) {
21777 0 0 int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false;
0 0 int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false;
0 0 int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false;
21779 0 0 if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) {
0 0 if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) {
0 0 if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) {
21781 0 0 split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas);
0 0 split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas);
0 0 split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas);
21782 0 0 for (auto&& lemma : lemmas) {
21783 0 0 if (lemma.find('~') != string::npos)
21784 0 0 return error.assign("Dictionary_flat_lemmas cannot contain '~' character!"), false;
21788 0 0 flat_lemmas.insert("greek.expression");
21791 0 0 if (!option_str(tagger, "dictionary", model).empty())
0 0 if (!option_str(tagger, "dictionary", model).empty())
0 0 if (!option_str(tagger, "dictionary", model).empty())
21792 0 0 return error.assign("The tagger 'dictionary' option is no longer supported, use 'dictionary_file' instead!"), false;
21793 0 0 const string& dictionary_file = option_str(tagger, "dictionary_file", model);
0 0 const string& dictionary_file = option_str(tagger, "dictionary_file", model);
21794 0 0 int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false;
0 0 int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false;
0 0 int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false;
21796 0 0 cerr << "Tagger model " << model+1 << " dictionary options: " << "max_form_analyses=" << max_form_analyses
21797 0 0 << ", custom dictionary_file=" << (dictionary_file.empty() ? "none" : dictionary_file) << endl;
0 0 << ", custom dictionary_file=" << (dictionary_file.empty() ? "none" : dictionary_file) << endl;
21800 0 0 int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false;
0 0 int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false;
0 0 int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false;
21801 0 0 int guesser_suffix_rules = run <= 1 ? 8 : 5 + hyperparameter_integer(run, 1, 0, 7);
21802 0 0 if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false;
0 0 if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false;
0 0 if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false;
21803 0 0 int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false;
0 0 int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false;
0 0 int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false;
0 0 int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false;
21804 0 0 int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false;
0 0 int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false;
0 0 int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false;
21805 0 0 int guesser_enrich_dictionary = run <= 1 ? 6 : 3 + hyperparameter_integer(run, 2, 0, 7);
21806 0 0 if (!dictionary_file.empty()) guesser_enrich_dictionary = 0;
21807 0 0 if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false;
0 0 if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false;
0 0 if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false;
21809 0 0 if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules
0 0 if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules
0 0 if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules
21810 0 0 << ", guesser_enrich_dictionary=" << guesser_enrich_dictionary << endl;
21812 0 0 cerr << "Tagger model " << model+1 << " guesser options: " << "suffix_rules=" << guesser_suffix_rules
21813 0 0 << ", prefixes_max=" << guesser_prefixes_max << ", prefix_min_count=" << guesser_prefix_min_count
0 0 << ", prefixes_max=" << guesser_prefixes_max << ", prefix_min_count=" << guesser_prefix_min_count
21814 0 0 << ", enrich_dictionary=" << guesser_enrich_dictionary << endl;
21817 0 0 stringstream guesser_description;
21819 0 0 stringstream guesser_input;
21820 0 0 for (auto&& sentence : training) {
21821 0 0 for (size_t i = 1; i < sentence.words.size(); i++)
21822 0 0 guesser_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t'
21823 0 0 << combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas) << '\t'
21824 0 0 << combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n';
21827 0 0 morphodita::morpho_statistical_guesser_trainer::train(guesser_input, guesser_suffix_len, guesser_suffix_rules, guesser_prefixes_max, guesser_prefix_min_count, guesser_description);
21835 0 0 for (auto&& sentence : training)
21836 0 0 for (size_t i = 1; i < sentence.words.size(); i++) {
21837 0 0 model_normalize_form(sentence.words[i].form, normalized_form);
21838 0 0 entry.assign(combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas))
21839 0 0 .append("\t").append(combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag))
0 0 .append("\t").append(combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag))
21840 0 0 .append("\t").append(normalized_form);
21845 0 0 for (auto&& form_analyses : entries) {
21847 0 0 for (auto&& analysis : form_analyses.second)
21848 0 0 analyses.emplace_back(analysis.second, analysis.first);
21849 0 0 if (max_form_analyses && int(analyses.size()) > max_form_analyses) {
0 0 if (max_form_analyses && int(analyses.size()) > max_form_analyses) {
0 0 if (max_form_analyses && int(analyses.size()) > max_form_analyses) {
21851 0 0 analyses.resize(max_form_analyses);
21853 0 0 for (auto&& analysis : analyses)
21859 0 0 dictionary_special_tags.number_tag = most_frequent_tag(training, "NUM", use_xpostag, use_feats, combined_tag);
0 0 dictionary_special_tags.number_tag = most_frequent_tag(training, "NUM", use_xpostag, use_feats, combined_tag);
21860 0 0 dictionary_special_tags.punctuation_tag = most_frequent_tag(training, "PUNCT", use_xpostag, use_feats, combined_tag);
0 0 dictionary_special_tags.punctuation_tag = most_frequent_tag(training, "PUNCT", use_xpostag, use_feats, combined_tag);
21861 0 0 dictionary_special_tags.symbol_tag = most_frequent_tag(training, "SYM", use_xpostag, use_feats, combined_tag);
0 0 dictionary_special_tags.symbol_tag = most_frequent_tag(training, "SYM", use_xpostag, use_feats, combined_tag);
21864 0 0 if (!dictionary_file.empty()) {
21865 0 0 ifstream is(path_from_utf8(dictionary_file).c_str());
21866 0 0 if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false;
0 0 if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false;
0 0 if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false;
21869 0 0 word entry;
21871 0 0 while (getline(is, line)) {
0 0 while (getline(is, line)) {
21873 0 0 if (line.empty()) continue;
21875 0 0 split(line, '\t', dictionary_parts);
21877 0 0 if (dictionary_parts.size() != 5)
21878 0 0 return error.assign("Dictionary line '").append(line).append("' does not contain 5 tab-separated columns!"), false;
0 0 return error.assign("Dictionary line '").append(line).append("' does not contain 5 tab-separated columns!"), false;
21880 0 0 model_normalize_form(dictionary_parts[0], entry.form);
21881 0 0 entry.lemma.assign(dictionary_parts[1].str, dictionary_parts[1].len == 1 && dictionary_parts[1].str[0] == '_' ? 0 : dictionary_parts[1].len);
0 0 entry.lemma.assign(dictionary_parts[1].str, dictionary_parts[1].len == 1 && dictionary_parts[1].str[0] == '_' ? 0 : dictionary_parts[1].len);
21882 0 0 entry.upostag.assign(dictionary_parts[2].str, dictionary_parts[2].len == 1 && dictionary_parts[2].str[0] == '_' ? 0 : dictionary_parts[2].len);
0 0 entry.upostag.assign(dictionary_parts[2].str, dictionary_parts[2].len == 1 && dictionary_parts[2].str[0] == '_' ? 0 : dictionary_parts[2].len);
21883 0 0 entry.xpostag.assign(dictionary_parts[3].str, dictionary_parts[3].len == 1 && dictionary_parts[3].str[0] == '_' ? 0 : dictionary_parts[3].len);
0 0 entry.xpostag.assign(dictionary_parts[3].str, dictionary_parts[3].len == 1 && dictionary_parts[3].str[0] == '_' ? 0 : dictionary_parts[3].len);
21884 0 0 entry.feats.assign(dictionary_parts[4].str, dictionary_parts[4].len == 1 && dictionary_parts[4].str[0] == '_' ? 0 : dictionary_parts[4].len);
0 0 entry.feats.assign(dictionary_parts[4].str, dictionary_parts[4].len == 1 && dictionary_parts[4].str[0] == '_' ? 0 : dictionary_parts[4].len);
21886 0 0 entry_encoded.assign(combine_lemma(entry, use_lemma, combined_lemma, flat_lemmas))
21887 0 0 .append("\t").append(combine_tag(entry, use_xpostag, use_feats, combined_tag))
0 0 .append("\t").append(combine_tag(entry, use_xpostag, use_feats, combined_tag))
21888 0 0 .append("\t").append(entry.form);
21894 0 0 if (guesser_enrich_dictionary) {
21896 0 0 stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology;
0 0 stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology;
0 0 stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology;
21897 0 0 guesser_only_morphology.put(morphodita::morpho_ids::GENERIC);
21898 0 0 morphodita::generic_morpho_encoder::encode(empty_data, dictionary_suffix_len, dictionary_special_tags, guesser_description_copy, guesser_only_morphology);
21900 0 0 unique_ptr guesser_only_morpho(morphodita::morpho::load(guesser_only_morphology));
21901 0 0 if (!guesser_only_morpho) return error.assign("Cannot create temporary guesser-only morphology!"), false;
0 0 if (!guesser_only_morpho) return error.assign("Cannot create temporary guesser-only morphology!"), false;
21906 0 0 for (auto&& sentence : training)
21907 0 0 for (size_t i = 1; i < sentence.words.size(); i++) {
21908 0 0 const auto& form = model_normalize_form(sentence.words[i].form, normalized_form);
21909 0 0 if (!analyzed_forms.count(form)) {
21910 0 0 guesser_only_morpho->analyze(form, morphodita::morpho::GUESSER, analyses);
21913 0 0 for (auto&& analyse : analyses) {
21914 0 0 entry.assign(analyse.lemma).push_back('\t');
21915 0 0 entry.append(analyse.tag).push_back('\t');
21917 0 0 if (dictionary_entries.insert(entry).second)
21918 0 0 if (!--to_add)
21927 0 0 vector sorted_dictionary(dictionary_entries.begin(), dictionary_entries.end());
21930 0 0 stringstream morpho_input;
21931 0 0 for (auto&& entry : sorted_dictionary)
21934 0 0 morpho_description.put(morphodita::morpho_ids::GENERIC);
21935 0 0 morphodita::generic_morpho_encoder::encode(morpho_input, dictionary_suffix_len, dictionary_special_tags, guesser_description, morpho_description);
21939 0 0 const string& dictionary_accuracy = option_str(tagger, "dictionary_accuracy", model);
0 0 const string& dictionary_accuracy = option_str(tagger, "dictionary_accuracy", model);
21940 0 0 if (!dictionary_accuracy.empty()) {
21941 0 0 unique_ptr morpho(morphodita::morpho::load(morpho_description));
21942 0 0 if (!morpho) return error.assign("Cannot create temporary morphology for evaluating accuracy!"), false;
0 0 if (!morpho) return error.assign("Cannot create temporary morphology for evaluating accuracy!"), false;
21943 0 0 morpho_description.seekg(0, ios::beg);
21948 0 0 word w;
21950 0 0 conllu_input_format->set_text(dictionary_accuracy.c_str());
21951 0 0 for (sentence sentence; conllu_input_format->next_sentence(sentence, error); )
0 0 for (sentence sentence; conllu_input_format->next_sentence(sentence, error); )
0 0 for (sentence sentence; conllu_input_format->next_sentence(sentence, error); )
21952 0 0 for (size_t i = 1; i < sentence.words.size(); i++) {
21953 0 0 morpho->analyze(model_normalize_form(sentence.words[i].form, normalized_form), morphodita::morpho::GUESSER, analyses);
0 0 morpho->analyze(model_normalize_form(sentence.words[i].form, normalized_form), morphodita::morpho::GUESSER, analyses);
21955 0 0 for (auto&& analysis : analyses) {
21956 0 0 w.lemma.assign("_");
21957 0 0 model_fill_word_analysis(analysis, true, use_lemma, true, true, w);
21961 0 0 all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats);
0 0 all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats);
0 0 all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats);
21972 0 0 if (!error.empty()) return false;
21981 0 0 double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false;
0 0 double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false;
0 0 double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false;
21983 0 0 if (tagger_order == 2) tagger_id = morphodita::tagger_ids::CONLLU2;
21984 0 0 else if (tagger_order == 2.5) tagger_id = morphodita::tagger_ids::CONLLU2_3;
21985 0 0 else if (tagger_order == 3) tagger_id = morphodita::tagger_ids::CONLLU3;
21986 0 0 else return error.assign("The tagger_order can be only 2, 2.5 or 3!"), false;
21988 0 0 int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false;
0 0 int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false;
0 0 int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false;
21989 0 0 bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false;
0 0 bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false;
0 0 bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false;
21990 0 0 bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false;
0 0 bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false;
0 0 bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false;
21992 0 0 option_str(tagger, "templates", model) == "tagger" ? tagger_features_tagger :
0 0 option_str(tagger, "templates", model) == "tagger" ? tagger_features_tagger :
21993 0 0 option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer :
0 0 option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer :
0 0 option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer :
0 0 option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer :
21994 0 0 !option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) :
0 0 !option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) :
0 0 !option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) :
0 0 !option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) :
0 0 !option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) :
0 0 !option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) :
0 0 !option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) :
21995 0 0 model == 1 ? tagger_features_lemmatizer : tagger_features_tagger;
0 0 model == 1 ? tagger_features_lemmatizer : tagger_features_tagger;
0 0 model == 1 ? tagger_features_lemmatizer : tagger_features_tagger;
0 0 model == 1 ? tagger_features_lemmatizer : tagger_features_tagger;
0 0 model == 1 ? tagger_features_lemmatizer : tagger_features_tagger;
21996 0 0 if (heldout.empty()) tagger_early_stopping = false;
21998 0 0 cerr << "Tagger model " << model+1 << " options: iterations=" << tagger_iterations
21999 0 0 << ", early_stopping=" << (tagger_early_stopping ? 1 : 0) << ", templates="
0 0 << ", early_stopping=" << (tagger_early_stopping ? 1 : 0) << ", templates="
22001 0 0 tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl;
0 0 tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl;
0 0 tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl;
22005 0 0 stringstream input, heldout_input, feature_templates_input(tagger_feature_templates);
0 0 stringstream input, heldout_input, feature_templates_input(tagger_feature_templates);
0 0 stringstream input, heldout_input, feature_templates_input(tagger_feature_templates);
22006 0 0 for (auto&& sentence : training) {
22007 0 0 for (size_t i = 1; i < sentence.words.size(); i++)
22008 0 0 input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t'
22009 0 0 << combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t'
22010 0 0 << combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n';
22014 0 0 for (auto&& sentence : heldout) {
22015 0 0 for (size_t i = 1; i < sentence.words.size(); i++)
22016 0 0 heldout_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t'
22017 0 0 << combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t'
22018 0 0 << combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n';
22022 0 0 os.put(tagger_id);
22023 0 0 morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os);
0 0 morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os);
0 0 morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os);
0 0 morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os);
0 0 morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os);
22032 0 0 while (separator < tag_separators.size() &&
22033 0 0 (w.upostag.find(tag_separators[separator]) != string::npos || w.xpostag.find(tag_separators[separator]) != string::npos))
22036 0 0 if (separator >= tag_separators.size()) {
22045 0 0 while (separator < tag_separators.size() &&
22046 0 0 (w.upostag.find(tag_separators[separator]) != string::npos || w.xpostag.find(tag_separators[separator]) != string::npos))
22048 0 0 if (separator >= tag_separators.size())
22054 0 0 if (xpostag || feats) {
22056 0 0 if (xpostag) combined_tag.append(w.xpostag);
22057 0 0 if (feats) combined_tag.push_back(tag_separators[separator]);
22058 0 0 if (feats) combined_tag.append(w.feats);
22067 0 0 for (auto&& sentence : data)
22068 0 0 for (size_t i = 1; i < sentence.words.size(); i++)
22069 0 0 if (sentence.words[i].upostag == upostag)
22070 0 0 counts[combine_tag(sentence.words[i], xpostag, feats, combined_tag)]++;
22072 0 0 combined_tag.assign("~").append(upostag);
22074 0 0 for (auto&& tags : counts)
22075 0 0 if (tags.second > best) {
22088 0 0 if (flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma))
22092 0 0 if (w.lemma == "")
22094 0 0 else if (w.lemma == "_")
22098 0 0 if (flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma)) {
22100 0 0 model_normalize_form(w.form, normalized_form);
22101 0 0 return combined_lemma.insert(0, "~").append("~").append(normalized_form);
0 0 return combined_lemma.insert(0, "~").append("~").append(normalized_form);
22111 0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
22118 0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
22120 0 0 if (options.count(indexed_name))
22121 0 0 return parse_int(options.at(indexed_name), name.c_str(), value, error);
22122 0 0 if (options.count(name))
22123 0 0 return parse_int(options.at(name), name.c_str(), value, error);
22129 0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
22131 0 0 if (options.count(indexed_name) || options.count(name)) {
22133 0 0 if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error))
0 0 if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error))
0 0 if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error))
22142 0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
0 0 if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model);
22144 0 0 if (options.count(indexed_name))
22145 0 0 return parse_double(options.at(indexed_name), name.c_str(), value, error);
22146 0 0 if (options.count(name))
22147 0 0 return parse_double(options.at(name), name.c_str(), value, error);
22307 0 0 training_error::training_error() : runtime_error(message_collector.str()) {
22580 0 0 decompose(str, true);
0 0 decompose(str, true);
0 0 decompose(str, true);
22585 0 0 for (old = 0, com = 0; old < str.size(); old++, com++) {
22587 0 0 if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) {
0 0 if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) {
0 0 if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) {
22589 0 0 if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) {
0 0 if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) {
0 0 if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) {
0 0 if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) {
22592 0 0 if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount)
0 0 if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount)
0 0 if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount)
0 0 if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount)
22595 0 0 } else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) {
0 0 } else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) {
0 0 } else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) {
22597 0 0 if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount)
0 0 if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount)
0 0 if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount)
0 0 if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount)
0 0 if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount)
22599 0 0 } else if (str[old] < CHARS) {
22603 0 0 for (int last_ccc = -1; old + 1 < str.size(); old++) {
22604 0 0 int ccc = str[old + 1] < CHARS ? ccc_block[ccc_index[str[old + 1] >> 8]][str[old + 1] & 0xFF] : 0;
22605 0 0 if (composition[1] - composition[0] && last_ccc < ccc) {
0 0 if (composition[1] - composition[0] && last_ccc < ccc) {
22608 0 0 while (l + 2 < r) {
22610 0 0 if (composition_data[m] <= str[old + 1]) l = m;
22611 0 0 if (composition_data[m] >= str[old + 1]) r = m;
22613 0 0 if (composition_data[l] == str[old + 1]) {
22621 0 0 if (!ccc) break;
22628 0 0 if (com < old) str.resize(com);
22635 0 0 for (auto&& chr : str) {
22638 0 0 if (chr >= Hangul::SBase && chr < Hangul::SBase + Hangul::SCount) {
22640 0 0 decomposition_len = 2 + ((chr - Hangul::SBase) % Hangul::TCount ? 1 : 0);
22641 0 0 } else if (chr < CHARS) {
22645 0 0 if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0;
0 0 if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0;
22646 0 0 if (decomposition_len && kompatibility && (decomposition[0] & 2))
0 0 if (decomposition_len && kompatibility && (decomposition[0] & 2))
22648 0 0 for (auto i = decomposition[0] >> 2; i < decomposition[1] >> 2; i++) {
22650 0 0 if (further_decomposition[0] & 1) decomposition_len += (further_decomposition[1] >> 2) - (further_decomposition[0] >> 2) - 1;
22654 0 0 if (!decomposition_len) continue;
22660 0 0 if (any_decomposition) {
22662 0 0 for (size_t dec = str.size(), old = dec - additional; old--; )
22663 0 0 if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) {
0 0 if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) {
0 0 if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) {
22666 0 0 if (s_index % Hangul::TCount) str[--dec] = Hangul::TBase + s_index % Hangul::TCount;
22669 0 0 } else if (str[old] < CHARS) {
22673 0 0 if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0;
0 0 if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0;
22674 0 0 if (decomposition_len && kompatibility && (decomposition[0] & 2)) {
0 0 if (decomposition_len && kompatibility && (decomposition[0] & 2)) {
22676 0 0 while (decomposition_len--) {
22679 0 0 if (further_decomposition[0] & 1) {
22680 0 0 for (int further_decomposition_len = (further_decomposition[1] >> 2) - (further_decomposition[0] >> 2); further_decomposition_len--; )
22686 0 0 } else if (decomposition_len) {
22688 0 0 while (decomposition_len--)
22701 0 0 for (size_t i = 1; i < str.size(); i++) {
22702 0 0 unsigned ccc = str[i] < CHARS ? ccc_block[ccc_index[str[i] >> 8]][str[i] & 0xFF] : 0;
22703 0 0 if (!ccc) continue;
22707 0 0 for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1];
0 0 for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1];
0 0 for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1];
0 0 for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1];
22963 0 0 for (; *str; str++)
22964 0 0 if (((unsigned char)*str) >= 0x80) {
22965 0 0 if (((unsigned char)*str) < 0xC0) return false;
22966 0 0 else if (((unsigned char)*str) < 0xE0) {
22967 0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22968 0 0 } else if (((unsigned char)*str) < 0xF0) {
22969 0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22970 0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22971 0 0 } else if (((unsigned char)*str) < 0xF8) {
22972 0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22973 0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22974 0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22981 0 0 for (; len > 0; str++, len--)
22982 0 0 if (((unsigned char)*str) >= 0x80) {
22983 0 0 if (((unsigned char)*str) < 0xC0) return false;
22984 0 0 else if (((unsigned char)*str) < 0xE0) {
22985 0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22986 0 0 } else if (((unsigned char)*str) < 0xF0) {
22987 0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22988 0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22989 0 0 } else if (((unsigned char)*str) < 0xF8) {
22990 0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22991 0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
22992 0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
0 0 str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false;
23001 0 0 for (char32_t chr; (chr = decode(str)); )
23008 0 0 while (len)
23015 0 0 for (auto&& chr : str)
23043 0 0 return {3, 3, 0, ""};
0 0 return {3, 3, 0, ""};
0 0 return {3, 3, 0, ""};
23573 3092 104350 IF_BIT_0(prob)
23097 84345 IF_BIT_0(prob)
23578 23091 6 if (checkDicSize != 0 || processedPos != 0)
23580 0 23091 (dic[(dicPos == 0 ? dicBufSize : dicPos) - 1] >> (8 - lc))));
23582 21934 1163 if (state < kNumLitStates)
23586 18121 157351 do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100);
81155 94317 do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100);
153538 21934 do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100);
23590 0 1163 unsigned matchByte = p->dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)];
23592 166 997 state -= (state < 10) ? 3 : 6;
23601 1028 8276 GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit)
5355 3949 GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit)
23603 8141 1163 while (symbol < 0x100);
23613 572 83773 IF_BIT_0(prob)
487 83858 IF_BIT_0(prob)
23622 83858 0 if (checkDicSize == 0 && processedPos == 0)
23625 280 83578 IF_BIT_0(prob)
83695 163 IF_BIT_0(prob)
23629 283 83412 IF_BIT_0(prob)
645 83050 IF_BIT_0(prob)
23632 0 645 dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)];
23635 3 642 state = state < kNumLitStates ? 9 : 11;
23645 49 114 IF_BIT_0(prob)
97 66 IF_BIT_0(prob)
23654 9 57 IF_BIT_0(prob)
37 29 IF_BIT_0(prob)
23670 82945 268 state = state < kNumLitStates ? 8 : 11;
23676 462 83238 IF_BIT_0(probLen)
445 83255 IF_BIT_0(probLen)
23687 274 82981 IF_BIT_0(probLen)
113 83142 IF_BIT_0(probLen)
23702 2505 664305 TREE_DECODE(probLen, limit, len);
2240 664570 TREE_DECODE(probLen, limit, len);
583110 83700 TREE_DECODE(probLen, limit, len);
23706 487 83213 if (state >= kNumStates)
23711 55 432 TREE_6_DECODE(prob, distance);
325 162 TREE_6_DECODE(prob, distance);
55 432 TREE_6_DECODE(prob, distance);
429 58 TREE_6_DECODE(prob, distance);
48 439 TREE_6_DECODE(prob, distance);
303 184 TREE_6_DECODE(prob, distance);
65 422 TREE_6_DECODE(prob, distance);
254 233 TREE_6_DECODE(prob, distance);
58 429 TREE_6_DECODE(prob, distance);
260 227 TREE_6_DECODE(prob, distance);
56 431 TREE_6_DECODE(prob, distance);
272 215 TREE_6_DECODE(prob, distance);
23712 405 82 if (distance >= kStartPosModelIndex)
23717 166 239 if (posSlot < kEndPosModelIndex)
23724 325 166 do
23726 54 437 GET_BIT2(prob + i, i, ; , distance |= mask);
239 252 GET_BIT2(prob + i, i, ; , distance |= mask);
23735 2333 239 do
23737 309 2263 NORMALIZE
23761 29 210 GET_BIT2(prob + i, i, ; , distance |= 1);
124 115 GET_BIT2(prob + i, i, ; , distance |= 1);
23762 37 202 GET_BIT2(prob + i, i, ; , distance |= 2);
118 121 GET_BIT2(prob + i, i, ; , distance |= 2);
23763 32 207 GET_BIT2(prob + i, i, ; , distance |= 4);
130 109 GET_BIT2(prob + i, i, ; , distance |= 4);
23764 26 213 GET_BIT2(prob + i, i, ; , distance |= 8);
126 113 GET_BIT2(prob + i, i, ; , distance |= 8);
23766 0 239 if (distance == (uint32_t)0xFFFFFFFF)
23778 487 0 if (checkDicSize == 0)
23780 487 0 if (distance >= processedPos)
23783 0 0 else if (distance >= checkDicSize)
23785 229 258 state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
23790 83700 0 if (limit == dicPos)
23794 0 83700 unsigned curLen = ((rem < len) ? (unsigned)rem : len);
23795 0 83700 size_t pos = (dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0);
23800 83700 0 if (pos + curLen <= dicBufSize)
23806 22566528 83700 do
23812 0 0 do
23815 0 0 if (++pos == dicBufSize)
23823 106938 504 while (dicPos < limit && buf < bufLimit);
23824 9 495 NORMALIZE;
23842 0 510 if (p->remainLen != 0 && p->remainLen < kMatchSpecLenStart)
23849 0 0 if (limit - dicPos < len)
23852 0 0 if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len)
0 0 if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len)
23857 0 0 while (len-- != 0)
23859 0 0 dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)];
23871 504 0 if (p->checkDicSize == 0)
23874 0 504 if (limit - p->dicPos > rem)
23877 504 0 RINOK(LzmaDec_DecodeReal(p, limit2, bufLimit));
23878 0 504 if (p->processedPos >= p->prop.dicSize)
23882 498 6 while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart);
0 498 while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart);
0 0 while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart);
23884 0 504 if (p->remainLen > kMatchSpecLenStart)
23915 0 470 IF_BIT_0_CHECK(prob)
0 0 IF_BIT_0_CHECK(prob)
37 433 IF_BIT_0_CHECK(prob)
23922 36 1 if (p->checkDicSize != 0 || p->processedPos != 0)
23925 0 36 (p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc))));
23927 27 10 if (state < kNumLitStates)
23930 23 193 do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100);
23 0 do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100);
111 105 do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100);
189 27 do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100);
23935 0 10 ((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)];
23945 10 70 GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit)
10 0 GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit)
56 24 GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit)
23947 70 10 while (symbol < 0x100);
23957 2 431 IF_BIT_0_CHECK(prob)
2 0 IF_BIT_0_CHECK(prob)
17 416 IF_BIT_0_CHECK(prob)
23969 0 416 IF_BIT_0_CHECK(prob)
0 0 IF_BIT_0_CHECK(prob)
415 1 IF_BIT_0_CHECK(prob)
23973 1 414 IF_BIT_0_CHECK(prob)
1 0 IF_BIT_0_CHECK(prob)
0 415 IF_BIT_0_CHECK(prob)
23976 0 0 NORMALIZE_CHECK;
0 0 NORMALIZE_CHECK;
23988 0 1 IF_BIT_0_CHECK(prob)
0 0 IF_BIT_0_CHECK(prob)
1 0 IF_BIT_0_CHECK(prob)
23996 0 1 IF_BIT_0_CHECK(prob)
0 0 IF_BIT_0_CHECK(prob)
0 1 IF_BIT_0_CHECK(prob)
24012 4 429 IF_BIT_0_CHECK(probLen)
4 0 IF_BIT_0_CHECK(probLen)
16 417 IF_BIT_0_CHECK(probLen)
24023 2 415 IF_BIT_0_CHECK(probLen)
2 0 IF_BIT_0_CHECK(probLen)
2 415 IF_BIT_0_CHECK(probLen)
24038 21 3353 TREE_DECODE_CHECK(probLen, limit, len);
21 0 TREE_DECODE_CHECK(probLen, limit, len);
56 3318 TREE_DECODE_CHECK(probLen, limit, len);
2941 433 TREE_DECODE_CHECK(probLen, limit, len);
24042 17 416 if (state < 4)
24048 12 90 TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot);
12 0 TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot);
62 40 TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot);
85 17 TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot);
24049 14 3 if (posSlot >= kStartPosModelIndex)
24055 9 5 if (posSlot < kEndPosModelIndex)
24062 37 5 do
24064 5 37 NORMALIZE_CHECK
5 0 NORMALIZE_CHECK
24075 33 14 do
24077 6 41 GET_BIT_CHECK(prob + i, i);
6 0 GET_BIT_CHECK(prob + i, i);
27 20 GET_BIT_CHECK(prob + i, i);
24085 9 461 NORMALIZE_CHECK;
9 0 NORMALIZE_CHECK;
24102 0 0 if (initDic)
24108 0 0 if (initState)
24123 47940 6 for (i = 0; i < numProbs; i++)
24139 510 0 while (p->remainLen != kMatchSpecLenStart)
24143 6 504 if (p->needFlush != 0)
24145 36 0 for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--)
30 6 for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--)
24147 0 6 if (p->tempBufSize < RC_INIT_SIZE)
24152 6 0 if (p->tempBuf[0] != 0)
24160 6 504 if (p->dicPos >= dicLimit)
24162 6 0 if (p->remainLen == 0 && p->code == 0)
6 0 if (p->remainLen == 0 && p->code == 0)
24167 0 0 if (finishMode == LZMA_FINISH_ANY)
24172 0 0 if (p->remainLen != 0)
24180 6 498 if (p->needInitState)
24183 0 504 if (p->tempBufSize == 0)
24187 470 34 if (inSize < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow)
24190 0 470 if (dummyRes == DUMMY_ERROR)
24198 0 470 if (checkEndMarkNow && dummyRes != DUMMY_MATCH)
24208 504 0 if (LzmaDec_DecodeReal2(p, dicLimit, bufLimit) != 0)
24218 0 0 while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize)
0 0 while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize)
24221 0 0 if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow)
24224 0 0 if (dummyRes == DUMMY_ERROR)
24230 0 0 if (checkEndMarkNow && dummyRes != DUMMY_MATCH)
24237 0 0 if (LzmaDec_DecodeReal2(p, dicLimit, p->buf) != 0)
24246 0 0 if (p->code == 0)
24261 0 0 if (p->dicPos == p->dicBufSize)
24264 0 0 if (outSize > p->dicBufSize - dicPos)
24284 0 0 if (res != 0)
24286 0 0 if (outSizeCur == 0 || outSize == 0)
24314 6 0 if (size < LZMA_PROPS_SIZE)
24319 0 6 if (dicSize < LZMA_DIC_MIN)
24324 6 0 if (d >= (9 * 5 * 5))
24338 0 6 if (p->probs == 0 || numProbs != p->numProbs)
0 0 if (p->probs == 0 || numProbs != p->numProbs)
24343 6 0 if (p->probs == 0)
24352 6 0 RINOK(LzmaProps_Decode(&propNew, props, propsSize));
24353 6 0 RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc));
24362 0 0 RINOK(LzmaProps_Decode(&propNew, props, propsSize));
24363 0 0 RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc));
24365 0 0 if (p->dic == 0 || dicBufSize != p->dicBufSize)
0 0 if (p->dic == 0 || dicBufSize != p->dicBufSize)
24369 0 0 if (p->dic == 0)
24389 6 0 if (inSize < RC_INIT_SIZE)
24394 6 0 if (res != 0)
24404 6 0 if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT)
0 6 if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT)
24418 6 6 static void LzmaFree(void* /*p*/, void *address) { delete[] (char*) address; }
24426 6 0 if (!is.read((char *) &uncompressed_len, sizeof(uncompressed_len))) return false;
24427 6 0 if (!is.read((char *) &compressed_len, sizeof(compressed_len))) return false;
24428 6 0 if (!is.read((char *) &poor_crc, sizeof(poor_crc))) return false;
24429 6 0 if (poor_crc != uncompressed_len * 19991 + compressed_len * 199999991 + 1234567890) return false;
24430 6 0 if (!is.read((char *) props_encoded, sizeof(props_encoded))) return false;
24433 6 0 if (!is.read((char *) compressed.data(), compressed_len)) return false;
6 0 if (!is.read((char *) compressed.data(), compressed_len)) return false;
24437 6 0 auto res = lzma::LzmaDecode(data.fill(uncompressed_len), &uncompressed_size, compressed.data(), &compressed_size, props_encoded, LZMA_PROPS_SIZE, lzma::LZMA_FINISH_ANY, &status, &lzmaAllocator);
24438 6 0 if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false;
6 0 if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false;
6 0 if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false;
24768 0 0 if (!p->directInput)
24780 0 0 if (p->directInput)
24785 0 0 if (p->bufferBase == 0 || p->blockSize != blockSize)
0 0 if (p->bufferBase == 0 || p->blockSize != blockSize)
24808 0 0 if (p->streamEndWasReached || p->result != SZ_OK)
0 0 if (p->streamEndWasReached || p->result != SZ_OK)
24810 0 0 if (p->directInput)
24813 0 0 if (curSize > p->directInputRem)
24817 0 0 if (p->directInputRem == 0)
24825 0 0 if (size == 0)
24828 0 0 if (p->result != SZ_OK)
24830 0 0 if (size == 0)
24836 0 0 if (p->streamPos - p->pos > p->keepSizeAfter)
24851 0 0 if (p->directInput)
0 0 if (p->directInput)
24859 0 0 if (p->streamEndWasReached)
24861 0 0 if (p->keepSizeAfter >= p->streamPos - p->pos)
24867 0 0 if (MatchFinder_NeedMove(p))
24890 0 0 for (i = 0; i < 256; i++)
0 0 for (i = 0; i < 256; i++)
24894 0 0 for (j = 0; j < 8; j++)
0 0 for (j = 0; j < 8; j++)
24915 0 0 if (sizeInBytes / sizeof(CLzRef) != num)
24925 0 0 if (historySize > kMaxHistorySize)
24931 0 0 if (historySize > ((uint32_t)2 << 30))
24938 0 0 if (LzInWindow_Create(p, sizeReserv, alloc))
24945 0 0 if (p->numHashBytes == 2)
24956 0 0 if (hs > (1 << 24))
24958 0 0 if (p->numHashBytes == 3)
24966 0 0 if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size;
24967 0 0 if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size;
24968 0 0 if (p->numHashBytes > 4) p->fixedHashSize += kHash4Size;
24978 0 0 p->numSons = (p->btMode ? newCyclicBufferSize * 2 : newCyclicBufferSize);
24980 0 0 if (p->hash != 0 && prevSize == newSize)
0 0 if (p->hash != 0 && prevSize == newSize)
24984 0 0 if (p->hash != 0)
24999 0 0 if (limit2 < limit)
25002 0 0 if (limit2 <= p->keepSizeAfter)
25004 0 0 if (limit2 > 0)
25009 0 0 if (limit2 < limit)
25013 0 0 if (lenLimit > p->matchMaxLen)
25023 0 0 for (i = 0; i < p->hashSizeSum; i++)
25042 0 0 for (i = 0; i < numItems; i++)
0 0 for (i = 0; i < numItems; i++)
25045 0 0 if (value <= subValue)
0 0 if (value <= subValue)
25062 0 0 if (p->pos == kMaxValForNormalize)
25064 0 0 if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos)
0 0 if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos)
25066 0 0 if (p->cyclicBufferPos == p->cyclicBufferSize)
25079 0 0 if (cutValue-- == 0 || delta >= _cyclicBufferSize)
0 0 if (cutValue-- == 0 || delta >= _cyclicBufferSize)
0 0 if (cutValue-- == 0 || delta >= _cyclicBufferSize)
25083 0 0 curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
25084 0 0 if (pb[maxLen] == cur[maxLen] && *pb == *cur)
0 0 if (pb[maxLen] == cur[maxLen] && *pb == *cur)
25087 0 0 while (++len != lenLimit)
25088 0 0 if (pb[len] != cur[len])
25090 0 0 if (maxLen < len)
25094 0 0 if (len == lenLimit)
25112 0 0 if (cutValue-- == 0 || delta >= _cyclicBufferSize)
0 0 if (cutValue-- == 0 || delta >= _cyclicBufferSize)
0 0 if (cutValue-- == 0 || delta >= _cyclicBufferSize)
25118 0 0 CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
25120 0 0 uint32_t len = (len0 < len1 ? len0 : len1);
25121 0 0 if (pb[len] == cur[len])
25123 0 0 if (++len != lenLimit && pb[len] == cur[len])
0 0 if (++len != lenLimit && pb[len] == cur[len])
0 0 if (++len != lenLimit && pb[len] == cur[len])
25124 0 0 while (++len != lenLimit)
25125 0 0 if (pb[len] != cur[len])
25127 0 0 if (maxLen < len)
25131 0 0 if (len == lenLimit)
25139 0 0 if (pb[len] < cur[len])
25166 0 0 if (cutValue-- == 0 || delta >= _cyclicBufferSize)
0 0 if (cutValue-- == 0 || delta >= _cyclicBufferSize)
0 0 if (cutValue-- == 0 || delta >= _cyclicBufferSize)
25172 0 0 CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
25174 0 0 uint32_t len = (len0 < len1 ? len0 : len1);
25175 0 0 if (pb[len] == cur[len])
25177 0 0 while (++len != lenLimit)
25178 0 0 if (pb[len] != cur[len])
25181 0 0 if (len == lenLimit)
25189 0 0 if (pb[len] < cur[len])
25214 0 0 static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; }
25236 0 0 GET_MATCHES_HEADER(2)
25241 0 0 GET_MATCHES_FOOTER(offset, 1)
25247 0 0 GET_MATCHES_HEADER(3)
25252 0 0 GET_MATCHES_FOOTER(offset, 2)
25258 0 0 GET_MATCHES_HEADER(3)
25270 0 0 if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur)
0 0 if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur)
25272 0 0 for (; maxLen != lenLimit; maxLen++)
25273 0 0 if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen])
25278 0 0 if (maxLen == lenLimit)
25281 0 0 MOVE_POS_RET;
25284 0 0 GET_MATCHES_FOOTER(offset, maxLen)
25290 0 0 GET_MATCHES_HEADER(4)
25304 0 0 if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur)
0 0 if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur)
25310 0 0 if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur)
0 0 if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur)
0 0 if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur)
25317 0 0 if (offset != 0)
25319 0 0 for (; maxLen != lenLimit; maxLen++)
25320 0 0 if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen])
25323 0 0 if (maxLen == lenLimit)
25326 0 0 MOVE_POS_RET;
25329 0 0 if (maxLen < 3)
25331 0 0 GET_MATCHES_FOOTER(offset, maxLen)
25337 0 0 GET_MATCHES_HEADER(4)
25351 0 0 if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur)
0 0 if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur)
25357 0 0 if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur)
0 0 if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur)
0 0 if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur)
25364 0 0 if (offset != 0)
25366 0 0 for (; maxLen != lenLimit; maxLen++)
25367 0 0 if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen])
25370 0 0 if (maxLen == lenLimit)
25373 0 0 MOVE_POS_RET;
25376 0 0 if (maxLen < 3)
25380 0 0 MOVE_POS_RET
25386 0 0 GET_MATCHES_HEADER(3)
25392 0 0 MOVE_POS_RET
25397 0 0 do
25399 0 0 SKIP_HEADER(2)
25403 0 0 SKIP_FOOTER
25410 0 0 do
25412 0 0 SKIP_HEADER(3)
25416 0 0 SKIP_FOOTER
25423 0 0 do
25426 0 0 SKIP_HEADER(3)
25431 0 0 SKIP_FOOTER
25438 0 0 do
25441 0 0 SKIP_HEADER(4)
25447 0 0 SKIP_FOOTER
25454 0 0 do
25457 0 0 SKIP_HEADER(4)
25464 0 0 MOVE_POS
25471 0 0 do
25473 0 0 SKIP_HEADER(3)
25478 0 0 MOVE_POS
25489 0 0 if (!p->btMode)
0 0 if (!p->btMode)
25494 0 0 else if (p->numHashBytes == 2)
0 0 else if (p->numHashBytes == 2)
25499 0 0 else if (p->numHashBytes == 3)
0 0 else if (p->numHashBytes == 3)
25611 0 0 if (level < 0) level = 5;
25613 0 0 if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26)));
0 0 if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26)));
0 0 if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26)));
25614 0 0 if (p->lc < 0) p->lc = 3;
25615 0 0 if (p->lp < 0) p->lp = 0;
25616 0 0 if (p->pb < 0) p->pb = 2;
25617 0 0 if (p->algo < 0) p->algo = (level < 5 ? 0 : 1);
25618 0 0 if (p->fb < 0) p->fb = (level < 7 ? 32 : 64);
0 0 if (p->fb < 0) p->fb = (level < 7 ? 32 : 64);
25619 0 0 if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1);
25620 0 0 if (p->numHashBytes < 0) p->numHashBytes = 4;
25621 0 0 if (p->mc == 0) p->mc = (16 + (p->fb >> 1)) >> (p->btMode ? 0 : 1);
25622 0 0 if (p->numThreads < 0)
25663 0 0 for (slotFast = 2; slotFast < kNumLogBits * 2; slotFast++)
0 0 for (slotFast = 2; slotFast < kNumLogBits * 2; slotFast++)
25667 0 0 for (j = 0; j < k; j++, c++)
0 0 for (j = 0; j < k; j++, c++)
25886 0 0 for (i = 0; i < kNumStates; i++)
25891 0 0 for (i = 0; i < kNumLenToPosStates; i++)
25912 0 0 for (i = 0; i < kNumStates; i++)
25917 0 0 for (i = 0; i < kNumLenToPosStates; i++)
25935 0 0 if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX ||
0 0 if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX ||
0 0 if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX ||
0 0 if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX ||
25936 0 0 props.dictSize > ((uint32_t)1 << kDicLogSizeMaxCompress) || props.dictSize > ((uint32_t)1 << 30))
25942 0 0 if (fb < 5)
25944 0 0 if (fb > LZMA_MATCH_LEN_MAX)
25955 0 0 if (props.btMode)
25957 0 0 if (props.numHashBytes < 2)
25959 0 0 else if (props.numHashBytes < 4)
25994 0 0 if (p->bufBase == 0)
25997 0 0 if (p->bufBase == 0)
26027 0 0 if (p->res != SZ_OK)
26030 0 0 if (num != p->outStream->Write(p->outStream, p->bufBase, num))
26038 0 0 if ((uint32_t)p->low < (uint32_t)0xFF000000 || (int)(p->low >> 32) != 0)
0 0 if ((uint32_t)p->low < (uint32_t)0xFF000000 || (int)(p->low >> 32) != 0)
26041 0 0 do
26046 0 0 if (buf == p->bufLim)
26060 0 0 for (i = 0; i < 5; i++)
26070 0 0 if (p->range < kTopValue)
26076 0 0 while (numBits != 0);
26083 0 0 if (symbol == 0)
26095 0 0 if (p->range < kTopValue)
26110 0 0 while (symbol < 0x10000);
26124 0 0 while (symbol < 0x10000);
26130 0 0 for (i = (1 << kNumMoveReducingBits) / 2; i < kBitModelTotal; i += (1 << kNumMoveReducingBits))
26136 0 0 for (j = 0; j < kCyclesBits; j++)
26140 0 0 while (w >= ((uint32_t)1 << 16))
26171 0 0 while (symbol < 0x10000);
0 0 while (symbol < 0x10000);
26187 0 0 while (symbol < 0x10000);
26195 0 0 for (i = numBitLevels; i != 0;)
26209 0 0 for (i = 0; i < numBitLevels; i++)
26222 0 0 while (symbol != 1)
0 0 while (symbol != 1)
0 0 while (symbol != 1)
0 0 while (symbol != 1)
26235 0 0 for (i = numBitLevels; i != 0; i--)
0 0 for (i = numBitLevels; i != 0; i--)
26249 0 0 for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumLowBits); i++)
0 0 for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumLowBits); i++)
26251 0 0 for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumMidBits); i++)
0 0 for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumMidBits); i++)
26253 0 0 for (i = 0; i < kLenNumHighSymbols; i++)
0 0 for (i = 0; i < kLenNumHighSymbols; i++)
26259 0 0 if (symbol < kLenNumLowSymbols)
26267 0 0 if (symbol < kLenNumLowSymbols + kLenNumMidSymbols)
26287 0 0 for (i = 0; i < kLenNumLowSymbols; i++)
26289 0 0 if (i >= numSymbols)
26293 0 0 for (; i < kLenNumLowSymbols + kLenNumMidSymbols; i++)
26295 0 0 if (i >= numSymbols)
26299 0 0 for (; i < numSymbols; i++)
26312 0 0 for (posState = 0; posState < numPosStates; posState++)
0 0 for (posState = 0; posState < numPosStates; posState++)
26319 0 0 if (updatePrice)
26320 0 0 if (--p->counters[posState] == 0)
26326 0 0 if (num != 0)
0 0 if (num != 0)
0 0 if (num != 0)
0 0 if (num != 0)
0 0 if (num != 0)
0 0 if (num != 0)
26338 0 0 if (numPairs > 0)
26341 0 0 if (lenRes == p->numFastBytes)
26346 0 0 if (numAvail > LZMA_MATCH_LEN_MAX)
26350 0 0 for (; lenRes < numAvail && pby[lenRes] == pby2[lenRes]; lenRes++);
0 0 for (; lenRes < numAvail && pby[lenRes] == pby2[lenRes]; lenRes++);
26373 0 0 if (repIndex == 0)
26381 0 0 if (repIndex == 1)
26405 0 0 if (p->opt[cur].prev1IsChar)
0 0 if (p->opt[cur].prev1IsChar)
26409 0 0 if (p->opt[cur].prev2)
0 0 if (p->opt[cur].prev2)
26428 0 0 while (cur != 0);
0 0 while (cur != 0);
26444 0 0 if (p->optimumEndIndex != p->optimumCurrentIndex)
26454 0 0 if (p->additionalOffset == 0)
26463 0 0 if (numAvail < 2)
26468 0 0 if (numAvail > LZMA_MATCH_LEN_MAX)
26473 0 0 for (i = 0; i < LZMA_NUM_REPS; i++)
26479 0 0 if (data[0] != data2[0] || data[1] != data2[1])
0 0 if (data[0] != data2[0] || data[1] != data2[1])
26484 0 0 for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++);
0 0 for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++);
26486 0 0 if (lenTest > repLens[repMaxIndex])
26489 0 0 if (repLens[repMaxIndex] >= p->numFastBytes)
26499 0 0 if (mainLen >= p->numFastBytes)
26508 0 0 if (mainLen < 2 && curByte != matchByte && repLens[repMaxIndex] < 2)
0 0 if (mainLen < 2 && curByte != matchByte && repLens[repMaxIndex] < 2)
26520 0 0 p->opt[1].price = GET_PRICE_0(p->isMatch[p->state][posState]) +
26531 0 0 if (matchByte == curByte)
26534 0 0 if (shortRepPrice < p->opt[1].price)
26540 0 0 lenEnd = ((mainLen >= repLens[repMaxIndex]) ? mainLen : repLens[repMaxIndex]);
26542 0 0 if (lenEnd < 2)
26549 0 0 for (i = 0; i < LZMA_NUM_REPS; i++)
26555 0 0 while (len >= 2);
26557 0 0 for (i = 0; i < LZMA_NUM_REPS; i++)
26561 0 0 if (repLen < 2)
26564 0 0 do
26568 0 0 if (curAndLenPrice < opt->price)
26581 0 0 len = ((repLens[0] >= 2) ? repLens[0] + 1 : 2);
26582 0 0 if (len <= mainLen)
26585 0 0 while (len > matches[offs])
26593 0 0 uint32_t lenToPosState = GetLenToPosState(len);
26594 0 0 if (distance < kNumFullDistances)
26603 0 0 if (curAndLenPrice < opt->price)
26610 0 0 if (len == matches[offs])
26613 0 0 if (offs == numPairs)
26632 0 0 if (cur == lenEnd)
26636 0 0 if (newLen >= p->numFastBytes)
26645 0 0 if (curOpt->prev1IsChar)
26648 0 0 if (curOpt->prev2)
26651 0 0 if (curOpt->backPrev2 < LZMA_NUM_REPS)
26662 0 0 if (posPrev == cur - 1)
26664 0 0 if (IsShortRep(curOpt))
26673 0 0 if (curOpt->prev1IsChar && curOpt->prev2)
0 0 if (curOpt->prev1IsChar && curOpt->prev2)
26682 0 0 if (pos < LZMA_NUM_REPS)
26688 0 0 if (pos < LZMA_NUM_REPS)
26692 0 0 for (i = 1; i <= pos; i++)
26694 0 0 for (; i < LZMA_NUM_REPS; i++)
26701 0 0 for (i = 1; i < LZMA_NUM_REPS; i++)
26726 0 0 LitEnc_GetPrice(probs, curByte, p->ProbPrices));
26731 0 0 if (curAnd1Price < nextOpt->price)
26742 0 0 if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0))
0 0 if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0))
0 0 if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0))
26745 0 0 if (shortRepPrice <= nextOpt->price)
26756 0 0 if (temp < numAvailFull)
26760 0 0 if (numAvailFull < 2)
26762 0 0 numAvail = (numAvailFull <= p->numFastBytes ? numAvailFull : p->numFastBytes);
26764 0 0 if (!nextIsChar && matchByte != curByte) /* speed optimization */
26771 0 0 if (limit > numAvailFull)
26774 0 0 for (temp = 1; temp < limit && data[temp] == data2[temp]; temp++);
0 0 for (temp = 1; temp < limit && data[temp] == data2[temp]; temp++);
26776 0 0 if (lenTest2 >= 2)
26788 0 0 while (lenEnd < offset)
26792 0 0 if (curAndLenPrice < opt->price)
26807 0 0 for (repIndex = 0; repIndex < LZMA_NUM_REPS; repIndex++)
26813 0 0 if (data[0] != data2[0] || data[1] != data2[1])
0 0 if (data[0] != data2[0] || data[1] != data2[1])
26815 0 0 for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++);
0 0 for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++);
26816 0 0 while (lenEnd < cur + lenTest)
26820 0 0 do
26824 0 0 if (curAndLenPrice < opt->price)
26835 0 0 if (repIndex == 0)
26844 0 0 if (limit > numAvailFull)
26846 0 0 for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++);
0 0 for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++);
26848 0 0 if (lenTest2 >= 2)
26868 0 0 while (lenEnd < offset)
26872 0 0 if (curAndLenPrice < opt->price)
26888 0 0 if (newLen > numAvail)
26891 0 0 for (numPairs = 0; newLen > matches[numPairs]; numPairs += 2);
26895 0 0 if (newLen >= startLen)
26900 0 0 while (lenEnd < cur + newLen)
26904 0 0 while (startLen > matches[offs])
26911 0 0 uint32_t lenToPosState = GetLenToPosState(lenTest);
26913 0 0 if (curBack < kNumFullDistances)
26919 0 0 if (curAndLenPrice < opt->price)
26927 0 0 if (/*_maxMode && */lenTest == matches[offs])
26934 0 0 if (limit > numAvailFull)
26936 0 0 for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++);
0 0 for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++);
26938 0 0 if (lenTest2 >= 2)
26957 0 0 while (lenEnd < offset)
26961 0 0 if (curAndLenPrice < opt->price)
26974 0 0 if (offs == numPairs)
26977 0 0 if (curBack >= kNumFullDistances)
26993 0 0 if (p->additionalOffset == 0)
27003 0 0 if (numAvail < 2)
27005 0 0 if (numAvail > LZMA_MATCH_LEN_MAX)
27010 0 0 for (i = 0; i < LZMA_NUM_REPS; i++)
27014 0 0 if (data[0] != data2[0] || data[1] != data2[1])
0 0 if (data[0] != data2[0] || data[1] != data2[1])
27016 0 0 for (len = 2; len < numAvail && data[len] == data2[len]; len++);
0 0 for (len = 2; len < numAvail && data[len] == data2[len]; len++);
27017 0 0 if (len >= p->numFastBytes)
27023 0 0 if (len > repLen)
27031 0 0 if (mainLen >= p->numFastBytes)
27039 0 0 if (mainLen >= 2)
27042 0 0 while (numPairs > 2 && mainLen == matches[numPairs - 4] + 1)
0 0 while (numPairs > 2 && mainLen == matches[numPairs - 4] + 1)
27044 0 0 if (!ChangePair(matches[numPairs - 3], mainDist))
27050 0 0 if (mainLen == 2 && mainDist >= 0x80)
27054 0 0 if (repLen >= 2 && (
0 0 if (repLen >= 2 && (
27055 0 0 (repLen + 1 >= mainLen) ||
27056 0 0 (repLen + 2 >= mainLen && mainDist >= (1 << 9)) ||
0 0 (repLen + 2 >= mainLen && mainDist >= (1 << 9)) ||
27057 0 0 (repLen + 3 >= mainLen && mainDist >= (1 << 15))))
27064 0 0 if (mainLen < 2 || numAvail <= 2)
27068 0 0 if (p->longestMatchLength >= 2)
27071 0 0 if ((p->longestMatchLength >= mainLen && newDistance < mainDist) ||
0 0 if ((p->longestMatchLength >= mainLen && newDistance < mainDist) ||
0 0 if ((p->longestMatchLength >= mainLen && newDistance < mainDist) ||
27072 0 0 (p->longestMatchLength == mainLen + 1 && !ChangePair(mainDist, newDistance)) ||
0 0 (p->longestMatchLength == mainLen + 1 && !ChangePair(mainDist, newDistance)) ||
27073 0 0 (p->longestMatchLength > mainLen + 1) ||
27074 0 0 (p->longestMatchLength + 1 >= mainLen && mainLen >= 3 && ChangePair(newDistance, mainDist)))
0 0 (p->longestMatchLength + 1 >= mainLen && mainLen >= 3 && ChangePair(newDistance, mainDist)))
27079 0 0 for (i = 0; i < LZMA_NUM_REPS; i++)
27083 0 0 if (data[0] != data2[0] || data[1] != data2[1])
0 0 if (data[0] != data2[0] || data[1] != data2[1])
27086 0 0 for (len = 2; len < limit && data[len] == data2[len]; len++);
0 0 for (len = 2; len < limit && data[len] == data2[len]; len++);
27087 0 0 if (len >= limit)
27110 0 0 if (p->result != SZ_OK)
27112 0 0 if (p->rc.res != SZ_OK)
0 0 if (p->rc.res != SZ_OK)
0 0 if (p->rc.res != SZ_OK)
27114 0 0 if (p->matchFinderBase.result != SZ_OK)
0 0 if (p->matchFinderBase.result != SZ_OK)
0 0 if (p->matchFinderBase.result != SZ_OK)
27116 0 0 if (p->result != SZ_OK)
0 0 if (p->result != SZ_OK)
0 0 if (p->result != SZ_OK)
27125 0 0 if (p->writeEndMark)
27135 0 0 for (i = 0; i < kAlignTableSize; i++)
27144 0 0 for (i = kStartPosModelIndex; i < kNumFullDistances; i++)
27152 0 0 for (lenToPosState = 0; lenToPosState < kNumLenToPosStates; lenToPosState++)
27157 0 0 for (posSlot = 0; posSlot < p->distTableSize; posSlot++)
27159 0 0 for (posSlot = kEndPosModelIndex; posSlot < p->distTableSize; posSlot++)
27165 0 0 for (i = 0; i < kStartPosModelIndex; i++)
27167 0 0 for (; i < kNumFullDistances; i++)
27198 0 0 if (p != 0)
27227 0 0 if (p->needInit)
27233 0 0 if (p->finished)
27235 0 0 RINOK(CheckErrors(p));
27240 0 0 if (p->nowPos64 == 0)
27244 0 0 if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0)
27255 0 0 if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) != 0)
27260 0 0 if (p->fastMode)
27266 0 0 if (len == 1 && pos == (uint32_t)-1)
0 0 if (len == 1 && pos == (uint32_t)-1)
27276 0 0 if (IsCharState(p->state))
27285 0 0 if (pos < LZMA_NUM_REPS)
27288 0 0 if (pos == 0)
27297 0 0 if (pos == 1)
27303 0 0 if (pos == 3)
27310 0 0 if (len == 1)
27325 0 0 GetPosSlot(pos, posSlot);
27326 0 0 RcTree_Encode(&p->rc, p->posSlotEncoder[GetLenToPosState(len)], kNumPosSlotBits, posSlot);
27328 0 0 if (posSlot >= kStartPosModelIndex)
27334 0 0 if (posSlot < kEndPosModelIndex)
27352 0 0 if (p->additionalOffset == 0)
27355 0 0 if (!p->fastMode)
27357 0 0 if (p->matchPriceCount >= (1 << 7))
27359 0 0 if (p->alignPriceCount >= kAlignTableSize)
27362 0 0 if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0)
27365 0 0 if (useLimits)
27367 0 0 if (processed + kNumOpts + 300 >= maxUnpackSize ||
0 0 if (processed + kNumOpts + 300 >= maxUnpackSize ||
27371 0 0 else if (processed >= (1 << 15))
27387 0 0 if (!RangeEnc_Alloc(&p->rc, alloc))
27392 0 0 if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp)
0 0 if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp)
0 0 if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp)
27397 0 0 if (p->litProbs == 0 || p->saveState.litProbs == 0)
0 0 if (p->litProbs == 0 || p->saveState.litProbs == 0)
27408 0 0 if (beforeSize + p->dictSize < keepWindowSize)
27412 0 0 if (!MatchFinder_Create(&p->matchFinderBase, p->dictSize, beforeSize, p->numFastBytes, LZMA_MATCH_LEN_MAX, allocBig))
27424 0 0 for (i = 0 ; i < LZMA_NUM_REPS; i++)
27429 0 0 for (i = 0; i < kNumStates; i++)
27432 0 0 for (j = 0; j < LZMA_NUM_PB_STATES_MAX; j++)
27445 0 0 for (i = 0; i < num; i++)
27450 0 0 for (i = 0; i < kNumLenToPosStates; i++)
27454 0 0 for (j = 0; j < (1 << kNumPosSlotBits); j++)
27459 0 0 for (i = 0; i < kNumFullDistances - kEndPosModelIndex; i++)
27466 0 0 for (i = 0; i < (1 << kNumAlignBits); i++)
27479 0 0 if (!p->fastMode)
27495 0 0 for (i = 0; i < (uint32_t)kDicLogSizeMaxCompress; i++)
27496 0 0 if (p->dictSize <= ((uint32_t)1 << i))
27502 0 0 RINOK(LzmaEnc_Alloc(p, keepWindowSize, alloc, allocBig));
27561 0 0 if (p->rem < size)
27601 0 0 if (reInit)
27612 0 0 if (outStream.overflow)
27625 0 0 if (res != SZ_OK || p->finished != 0)
0 0 if (res != SZ_OK || p->finished != 0)
27627 0 0 if (progress != 0)
27630 0 0 if (res != SZ_OK)
27644 0 0 RINOK(LzmaEnc_Prepare(pp, outStream, inStream, alloc, allocBig));
27653 0 0 if (*size < LZMA_PROPS_SIZE)
27658 0 0 for (i = 11; i <= 30; i++)
27660 0 0 if (dictSize <= ((uint32_t)2 << i))
27665 0 0 if (dictSize <= ((uint32_t)3 << i))
27672 0 0 for (i = 0; i < 4; i++)
27696 0 0 if (res == SZ_OK)
27700 0 0 if (outStream.overflow)
27711 0 0 if (p == 0)
27715 0 0 if (res == SZ_OK)
27718 0 0 if (res == SZ_OK)
27746 0 0 auto res = lzma::LzmaEncode(compressed.data(), &compressed_size, enc.data.data(), uncompressed_size, &props, props_encoded, &props_encoded_size, 0, nullptr, &lzmaAllocator, &lzmaAllocator);
27747 0 0 if (res != SZ_OK) return false;
27750 0 0 if (uint32_t(uncompressed_size) != uncompressed_size || uint32_t(compressed_size) != compressed_size) return false;
0 0 if (uint32_t(uncompressed_size) != uncompressed_size || uint32_t(compressed_size) != compressed_size) return false;
27751 0 0 if (!os.write((const char*) &uncompressed_size, sizeof(uint32_t))) return false;
0 0 if (!os.write((const char*) &uncompressed_size, sizeof(uint32_t))) return false;
27752 0 0 if (!os.write((const char*) &compressed_size, sizeof(uint32_t))) return false;
0 0 if (!os.write((const char*) &compressed_size, sizeof(uint32_t))) return false;
27753 0 0 if (!os.write((const char*) &poor_crc, sizeof(uint32_t))) return false;
0 0 if (!os.write((const char*) &poor_crc, sizeof(uint32_t))) return false;
27754 0 0 if (!os.write((const char*) props_encoded, sizeof(props_encoded))) return false;
0 0 if (!os.write((const char*) props_encoded, sizeof(props_encoded))) return false;
27755 0 0 if (!os.write((const char*) compressed.data(), compressed_size)) return false;
0 0 if (!os.write((const char*) compressed.data(), compressed_size)) return false;
27777 0 0 return {1, 3, 0, ""};
27790 0 0 << (udpipe.prerelease.empty() ? "" : "-") << udpipe.prerelease
0 0 << (udpipe.prerelease.empty() ? "" : "-") << udpipe.prerelease
27792 0 0 << (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease
0 0 << (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease
27794 0 0 << (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease
0 0 << (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease
27796 0 0 << (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease
0 0 << (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease
27797 0 0 << (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n"
0 0 << (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n"
27799 0 0 "Mathematics and Physics, Charles University in Prague, Czech Republic.";
27805 2 0 } // namespace ufal
2 0 } // namespace ufal