line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Treex::Core::Types; |
2
|
|
|
|
|
|
|
$Treex::Core::Types::VERSION = '2.20210102'; |
3
|
27
|
|
|
27
|
|
192
|
use strict; |
|
27
|
|
|
|
|
71
|
|
|
27
|
|
|
|
|
863
|
|
4
|
27
|
|
|
27
|
|
143
|
use warnings; |
|
27
|
|
|
|
|
55
|
|
|
27
|
|
|
|
|
661
|
|
5
|
27
|
|
|
27
|
|
141
|
use utf8; |
|
27
|
|
|
|
|
49
|
|
|
27
|
|
|
|
|
209
|
|
6
|
27
|
|
|
27
|
|
754
|
use Moose::Util::TypeConstraints; |
|
27
|
|
|
|
|
62
|
|
|
27
|
|
|
|
|
397
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
# Integer that is zero or greater (0, 1, 2, ...).
subtype 'Treex::Type::NonNegativeInt',
    as 'Int',
    where { 0 <= $_ },
    message { "$_ isn't non-negative" };
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# Selector: a possibly empty string of ASCII letters and digits,
# matched case-insensitively.
subtype 'Treex::Type::Selector',
    as 'Str',
    where { $_ =~ m/^[a-z\d]*$/i },
    message { "Selector must =~ /^[a-z\\d]*\$/i. You've provided $_" };    #TODO: this message is not printed
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# Layer: exactly one of the letters P, T, A or N, in either case.
subtype 'Treex::Type::Layer',
    as 'Str',
    where { $_ =~ m/^[ptan]$/i },
    message { "Layer must be one of: [P]hrase structure, [T]ectogrammatical, [A]nalytical, [N]amed entities, you've provided $_" };
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# Returns the list of layer codes known to Treex: A, T, P and N (in that order).
# Takes no arguments.
sub layers {
    return ( 'A', 'T', 'P', 'N' );
}
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Message: any string that is not empty.
subtype 'Treex::Type::Message',
    as 'Str',
    where { length($_) > 0 },
    message { 'Message must be nonempty' };
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# Identifier: currently any string is accepted; the separate type is kept
# in preparation for possible future constraints.
subtype 'Treex::Type::Id', as 'Str';
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# Zone code: either a language code alone ("en") or a language code and
# a selector joined by a single underscore ("en_src"). The selector may be
# empty and may contain only ASCII letters and digits.
# TODO: Should this be named ZoneCode or ZoneLabel?
subtype 'Treex::Type::ZoneCode'
    => as 'Str'
    => where {
        # Split into at most two fields. Without the limit, everything after
        # a second underscore was silently discarded, so values such as
        # "en_src_junk" passed validation. With the limit the remainder stays
        # in the selector part and is rejected by the selector pattern.
        my ( $lang, $selector ) = split /_/, $_, 2;

        # An empty string produces no fields at all; reject it here instead
        # of looking up an undefined language code.
        defined($lang)
            && is_lang_code($lang)
            && ( !defined $selector || $selector =~ /^[a-z\d]*$/i );
    }
    => message {'ZoneCode must be LangCode or LangCode_Selector, e.g. "en_src"'};
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# ISO 639-1 language code with some extensions from ISO 639-2, 639-3 and ISO 15924 (script names). |
43
|
|
|
|
|
|
|
# Added code for Modern Greek which comes under ISO 639-3 (but normally it is encoded using ISO 639-1 'el'). |
44
|
27
|
|
|
27
|
|
85026
|
use Locale::Language; |
|
27
|
|
|
|
|
5699788
|
|
|
27
|
|
|
|
|
11393
|
|
45
|
|
|
|
|
|
|
# Language codes accepted in addition to those returned by
# all_language_codes(). Maps each code to an English language name.
# The keys are merged into %IS_LANG_CODE, and get_lang_name() consults
# this table before falling back to code2language().
my %EXTRA_LANG_CODES = (
    'abq' => "Abaza",
    'aii' => "Assyrian",
    'ajp' => "South Levantine Arabic",
    'akk' => "Akkadian",
    'apu' => "Apurina", # Apurinã
    'aqz' => "Akuntsu",
    'bho' => "Bhojpuri",
    'bxr' => "Buryat",
    'ckb' => "Sorani", # Central Kurdish
    'ckt' => "Chukchi",
    'cop' => "Coptic", # ISO 639-2
    'dbl' => "Dyirbal",
    'dsb' => "Lower Sorbian",
    'ell' => "Modern Greek", # ISO 639-3
    'fro' => "Old French",
    'got' => "Gothic", # ISO 639-2
    'grc' => "Ancient Greek", # ISO 639-2
    'gsw' => "Swiss German",
    'gun' => "Mbya Guarani",
    'hit' => "Hittite", # ISO 639-2
    'hsb' => "Upper Sorbian",
    'hak' => "Hakka",
    'kaa' => "Karakalpak",
    'kfm' => "Khunsari",
    'kmr' => "Kurmanji", # Northern Kurdish
    'koi' => "Komi Permyak",
    'kpv' => "Komi Zyrian",
    'krl' => "Karelian",
    # Script-specific variants: language code plus a script name suffix.
    'ku-latn' => "Kurdish in Latin script",
    'ku-arab' => "Kurdish in Arabic script",
    'ku-cyrl' => "Kurdish in Cyrillic script",
    'lzh' => "Classical Chinese",
    'mdf' => "Moksha",
    'mga' => "Middle Irish",
    'mul' => "multiple languages", # ISO 639-2 code
    'myu' => "Munduruku", # Mundurukú
    'myv' => "Erzya",
    'nan' => "Taiwanese",
    'ndg' => "Ndengeleko",
    'nyq' => "Nayini",
    'olo' => "Livvi", # Olonets Karelian
    'orv' => "Old Russian",
    'otk' => "Old Turkish",
    'pcm' => "Nigerian Pidgin (Naija)",
    'pgl' => "Archaic Irish",
    'rmy' => "Romany",
    'qhe' => "Hindi-English", # used in UD bilingual corpora
    'qtd' => "Turkish-German", # used in UD bilingual corpora
    'quz' => "Cusco Quechua",
    'sah' => "Yakut",
    'sga' => "Old Irish",
    'sme' => "North Sami",
    'sms' => "Skolt Sami",
    'soj' => "Soi",
    'swl' => "Swedish Sign Language",
    'tpn' => "Tupinamba", # Tupinambá
    'und' => "unknown", # ISO 639-2 code for undetermined/unknown language
    'xal' => "Kalmyk",
    'wbp' => "Warlpiri",
    'yii' => "Yidiny",
    'yue' => "Cantonese",
);
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Lookup table of every accepted language code: the codes returned by
# all_language_codes() plus the keys of %EXTRA_LANG_CODES. Every value is 1.
my %IS_LANG_CODE;
for my $known_code ( all_language_codes(), keys %EXTRA_LANG_CODES ) {
    $IS_LANG_CODE{$known_code} = 1;
}
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# Language code: a string that has an entry in the %IS_LANG_CODE lookup table.
subtype 'Treex::Type::LangCode',
    as 'Str',
    where { defined $IS_LANG_CODE{$_} },
    message { 'LangCode must be valid ISO 639-1 code. E.g. en, de, cs' };
115
|
11
|
|
|
11
|
1
|
67
|
# Checks whether the given argument is an accepted language code.
#
# Parameters:
#   $code - the candidate code (may be undef)
# Returns:
#   1 when the code is present in %IS_LANG_CODE, undef otherwise.
sub is_lang_code {
    my ($code) = @_;

    # An undefined argument can never be a language code. Answer directly
    # rather than using undef as a hash key, which would emit an
    # "uninitialized value" warning. The result stays a single undef value,
    # exactly as a failed lookup yields.
    return defined $code ? $IS_LANG_CODE{$code} : undef;
}
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Returns the language name for the given language code.
#
# Codes listed in %EXTRA_LANG_CODES are answered from that table; every
# other code is passed on to code2language().
sub get_lang_name {
    my ($code) = @_;

    if ( exists $EXTRA_LANG_CODES{$code} ) {
        return $EXTRA_LANG_CODES{$code};
    }

    return code2language($code);
}
121
|
|
|
|
|
|
|
1; |
122
|
|
|
|
|
|
|
__END__ |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=encoding utf-8 |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=head1 NAME |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Treex::Core::Types - types used in Treex framework |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head1 VERSION |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
version 2.20210102 |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head1 DESCRIPTION |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head1 TYPES |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=over 4 |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=item Treex::Type::NonNegativeInt |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
0, 1, 2, ... |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=item Treex::Type::Layer |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
one of: P, T, A, N |
147
|
|
|
|
|
|
|
case insensitive |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=item Treex::Type::Selector |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
Selector - only alphanumeric characters, may be empty |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=item Treex::Type::LangCode |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
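ISO 639-1 code, or one of the additional ISO 639-2/639-3 and script-specific codes accepted by this module (e.g. "grc", "ku-latn") |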
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=item Treex::Type::ZoneCode |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Combination of LangCode and Selector, e.g. "en_src" |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=item Treex::Type::Message |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
just nonempty string, future constraints may be set |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=item Treex::Type::Id |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
identifier, prepared for future constraints, now it is any string |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=back |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=head1 METHODS |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
=over 4 |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=item get_lang_name |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
Returns language name for given LangCode |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=item is_lang_code |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
Checks whether given argument is valid LangCode |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=item layers |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Returns the list of layers available in Treex, currently (A, T, P, N) |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=back |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=head1 AUTHOR |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
Tomáš Kraut <kraut@ufal.mff.cuni.cz> |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles University in Prague |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. |