| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Treex::Core::Types; |
|
2
|
|
|
|
|
|
|
$Treex::Core::Types::VERSION = '2.20210102'; |
|
3
|
27
|
|
|
27
|
|
192
|
use strict; |
|
|
27
|
|
|
|
|
71
|
|
|
|
27
|
|
|
|
|
863
|
|
|
4
|
27
|
|
|
27
|
|
143
|
use warnings; |
|
|
27
|
|
|
|
|
55
|
|
|
|
27
|
|
|
|
|
661
|
|
|
5
|
27
|
|
|
27
|
|
141
|
use utf8; |
|
|
27
|
|
|
|
|
49
|
|
|
|
27
|
|
|
|
|
209
|
|
|
6
|
27
|
|
|
27
|
|
754
|
use Moose::Util::TypeConstraints; |
|
|
27
|
|
|
|
|
62
|
|
|
|
27
|
|
|
|
|
397
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
# Integers greater than or equal to zero (0, 1, 2, ...).
subtype(
    'Treex::Type::NonNegativeInt',
    as('Int'),
    where { $_ >= 0 },
    message { "$_ isn't non-negative" },
);
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# Selector: a possibly empty string made only of the letters a-z (matched
# case-insensitively) and digits.
# The pattern is anchored with \A and \z instead of ^ and $: a bare $ also
# matches just before a trailing newline, so "src\n" used to be accepted.
subtype 'Treex::Type::Selector'
    => as 'Str'
    => where {m/\A[a-z\d]*\z/i}
    => message {"Selector must =~ /^[a-z\\d]*\$/i. You've provided $_"};    #TODO: this message is not printed
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# Layer: exactly one of the letters P, T, A, N, in either case.
# The pattern is anchored with \A and \z instead of ^ and $: a bare $ also
# matches just before a trailing newline, so "a\n" used to be accepted.
subtype 'Treex::Type::Layer'
    => as 'Str'
    => where {m/\A[ptan]\z/i}
    => message {"Layer must be one of: [P]hrase structure, [T]ectogrammatical, [A]nalytical, [N]amed entities, you've provided $_"};
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# Returns the list of layer letters: A, T, P, N (in this order).
# A list literal is returned on purpose (not an array), so the value seen
# in scalar context stays the last element, as with the qw() form.
sub layers {
    return ( 'A', 'T', 'P', 'N' );
}
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Message: any string that is not empty.
subtype(
    'Treex::Type::Message',
    as('Str'),
    where { length($_) > 0 },
    message { 'Message must be nonempty' },
);
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Identifier. Currently a plain alias of Str with no extra checks;
# it exists so that constraints can be added later in one place.
subtype( 'Treex::Type::Id', as('Str') );
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# TODO: Should this be named ZoneCode or ZoneLabel?
# ZoneCode: either a bare language code ("en"), or a language code and a
# selector joined by one underscore ("en_src"). The selector may be empty.
subtype 'Treex::Type::ZoneCode'
    => as 'Str'
    => where {
        # LIMIT 2 keeps everything after the first underscore in $selector.
        # Without it, extra fields were silently dropped and strings such as
        # "en_src_junk" or "en__" passed the check.
        my ( $lang, $selector ) = split /_/, $_, 2;

        # An empty string splits into an empty list, leaving $lang undefined.
        defined $lang
            && is_lang_code($lang)
            # \A...\z rather than ^...$ so that a trailing newline is rejected.
            && ( !defined $selector || $selector =~ /\A[a-z\d]*\z/i );
    }
    => message {'ZoneCode must be LangCode or LangCode_Selector, e.g. "en_src"'};
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# ISO 639-1 language code with some extensions from ISO 639-2, 639-3 and ISO 15924 (script names). |
|
43
|
|
|
|
|
|
|
# Added code for Modern Greek which comes under ISO 639-3 (but normally it is encoded using ISO 639-1 'el'). |
|
44
|
27
|
|
|
27
|
|
85026
|
use Locale::Language; |
|
|
27
|
|
|
|
|
5699788
|
|
|
|
27
|
|
|
|
|
11393
|
|
|
45
|
|
|
|
|
|
|
# Language codes accepted in addition to those provided by Locale::Language,
# mapped to their English names. Entries are kept in alphabetical order
# by code.
my %EXTRA_LANG_CODES = (
    abq       => 'Abaza',
    aii       => 'Assyrian',
    ajp       => 'South Levantine Arabic',
    akk       => 'Akkadian',
    apu       => 'Apurina',                       # Apurinã
    aqz       => 'Akuntsu',
    bho       => 'Bhojpuri',
    bxr       => 'Buryat',
    ckb       => 'Sorani',                        # Central Kurdish
    ckt       => 'Chukchi',
    cop       => 'Coptic',                        # ISO 639-2
    dbl       => 'Dyirbal',
    dsb       => 'Lower Sorbian',
    ell       => 'Modern Greek',                  # ISO 639-3
    fro       => 'Old French',
    got       => 'Gothic',                        # ISO 639-2
    grc       => 'Ancient Greek',                 # ISO 639-2
    gsw       => 'Swiss German',
    gun       => 'Mbya Guarani',
    hak       => 'Hakka',
    hit       => 'Hittite',                       # ISO 639-2
    hsb       => 'Upper Sorbian',
    kaa       => 'Karakalpak',
    kfm       => 'Khunsari',
    kmr       => 'Kurmanji',                      # Northern Kurdish
    koi       => 'Komi Permyak',
    kpv       => 'Komi Zyrian',
    krl       => 'Karelian',
    'ku-arab' => 'Kurdish in Arabic script',
    'ku-cyrl' => 'Kurdish in Cyrillic script',
    'ku-latn' => 'Kurdish in Latin script',
    lzh       => 'Classical Chinese',
    mdf       => 'Moksha',
    mga       => 'Middle Irish',
    mul       => 'multiple languages',            # ISO 639-2 code
    myu       => 'Munduruku',                     # Mundurukú
    myv       => 'Erzya',
    nan       => 'Taiwanese',
    ndg       => 'Ndengeleko',
    nyq       => 'Nayini',
    olo       => 'Livvi',                         # Olonets Karelian
    orv       => 'Old Russian',
    otk       => 'Old Turkish',
    pcm       => 'Nigerian Pidgin (Naija)',
    pgl       => 'Archaic Irish',
    qhe       => 'Hindi-English',                 # used in UD bilingual corpora
    qtd       => 'Turkish-German',                # used in UD bilingual corpora
    quz       => 'Cusco Quechua',
    rmy       => 'Romany',
    sah       => 'Yakut',
    sga       => 'Old Irish',
    sme       => 'North Sami',
    sms       => 'Skolt Sami',
    soj       => 'Soi',
    swl       => 'Swedish Sign Language',
    tpn       => 'Tupinamba',                     # Tupinambá
    und       => 'unknown',                       # ISO 639-2 code for undetermined/unknown language
    wbp       => 'Warlpiri',
    xal       => 'Kalmyk',
    yii       => 'Yidiny',
    yue       => 'Cantonese',
);
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Lookup table of every accepted language code: the codes returned by
# all_language_codes() plus the keys of %EXTRA_LANG_CODES. Every value is 1.
my %IS_LANG_CODE;
for my $known_code ( all_language_codes(), keys %EXTRA_LANG_CODES ) {
    $IS_LANG_CODE{$known_code} = 1;
}
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# LangCode: a string that is a key of the %IS_LANG_CODE lookup table.
subtype(
    'Treex::Type::LangCode',
    as('Str'),
    where { exists $IS_LANG_CODE{$_} },
    message { 'LangCode must be valid ISO 639-1 code. E.g. en, de, cs' },
);
|
115
|
11
|
|
|
11
|
1
|
67
|
# Checks whether the given string is an accepted language code.
# Returns the %IS_LANG_CODE entry for it: 1 for a known code,
# undef otherwise.
sub is_lang_code {
    my ($code) = @_;
    return $IS_LANG_CODE{$code};
}
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Returns the language name for the given language code.
# Codes listed in %EXTRA_LANG_CODES take precedence; every other code is
# passed on to code2language().
sub get_lang_name {
    my ($code) = @_;

    if ( exists $EXTRA_LANG_CODES{$code} ) {
        return $EXTRA_LANG_CODES{$code};
    }

    return code2language($code);
}
|
121
|
|
|
|
|
|
|
1; |
|
122
|
|
|
|
|
|
|
__END__ |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=encoding utf-8 |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=head1 NAME |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Treex::Core::Types - types used in Treex framework |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head1 VERSION |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
version 2.20210102 |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head1 TYPES |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=over 4 |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=item Treex::Type::NonNegativeInt |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
0, 1, 2, ... |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=item Treex::Type::Layer |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
one of: P, T, A, N |
|
147
|
|
|
|
|
|
|
case insensitive |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=item Treex::Type::Selector |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
Selector - only alphanumeric characters, may be empty |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=item Treex::Type::LangCode |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
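ISO 639-1 code, or one of the additional codes accepted by this module (taken from ISO 639-2 and ISO 639-3, or combined with a script name, e.g. "grc", "ku-latn") |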
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=item Treex::Type::ZoneCode |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Combination of LangCode and Selector, e.g. "en_src" |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=item Treex::Type::Message |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
just nonempty string, future constraints may be set |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=item Treex::Type::Id |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
identifier, prepared for future constraints, now it is any string |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=back |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=head1 METHODS |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=over 4 |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=item get_lang_name |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
Returns language name for given LangCode |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=item is_lang_code |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Checks whether the given argument is a valid LangCode |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=item layers |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Returns the list of layers available in Treex, currently (A, T, P, N) |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=back |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=head1 AUTHOR |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
Tomáš Kraut <kraut@ufal.mff.cuni.cz> |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. |