lib/MKDoc/XML/Tagger.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 106 | 109 | 97.2 |
branch | 4 | 4 | 100.0 |
condition | 2 | 4 | 50.0 |
subroutine | 12 | 13 | 92.3 |
pod | 2 | 2 | 100.0 |
total | 126 | 132 | 95.4 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | # ------------------------------------------------------------------------------------- | ||||||
2 | # MKDoc::XML::Tagger | ||||||
3 | # ------------------------------------------------------------------------------------- | ||||||
4 | # Author : Jean-Michel Hiver. | ||||||
5 | # Copyright : (c) MKDoc Holdings Ltd, 2003 | ||||||
6 | # | ||||||
7 | # This module adds markup to an existing XML file / variable by matching expression. | ||||||
8 | # You could see it as an XML-compatible search and substitute module. | ||||||
9 | # | ||||||
10 | # The main reason it exists is to automagically hyperlink HTML in MKDoc, and also to | ||||||
11 | # mark up properly abbreviations based on glossaries. | ||||||
12 | # | ||||||
13 | # This module is distributed under the same license as Perl itself. | ||||||
14 | # ------------------------------------------------------------------------------------- | ||||||
15 | package MKDoc::XML::Tagger; | ||||||
16 | 6 | 6 | 50584 | use MKDoc::XML::Tokenizer; | |||
6 | 19 | ||||||
6 | 297 | ||||||
17 | 6 | 6 | 38 | use strict; | |||
6 | 13 | ||||||
6 | 203 | ||||||
18 | 6 | 6 | 29 | use warnings; | |||
6 | 11 | ||||||
6 | 151 | ||||||
19 | 6 | 6 | 8222 | use utf8; | |||
6 | 106 | ||||||
6 | 33 | ||||||
20 | |||||||
21 | our $tags = []; | ||||||
22 | our $Ignorable_RE = qr /(?:\r|\n|\s|(?:\&\(\d+\)))*/; | ||||||
23 | |||||||
24 | our @DONT_TAG = qw/a/; | ||||||
25 | |||||||
26 | ## | ||||||
27 | # $class->process_data ($xml, @expressions); | ||||||
28 | # ------------------------------------------ | ||||||
29 | # Tags $xml with @expressions, where expression is a list of hashes. | ||||||
30 | # | ||||||
31 | # For example: | ||||||
32 | # | ||||||
33 | # MKDoc::XML::Tagger->process ( | ||||||
34 | # 'I like oranges and bananas', | ||||||
35 | # { _expr => 'oranges', _tag => 'a', href => 'http://www.google.com?q=oranges' }, | ||||||
36 | # { _expr => 'bananas', _tag => 'a', href => 'http://www.google.com?q=bananas' }, | ||||||
37 | # | ||||||
38 | # Will return | ||||||
39 | # | ||||||
40 | # 'I like oranges and \ | ||||||
41 | # bananas. | ||||||
42 | ## | ||||||
43 | sub process_data | ||||||
44 | { | ||||||
45 | 14 | 14 | 1 | 8833 | my $class = shift; | ||
46 | 14 | 98 | my $tokens = MKDoc::XML::Tokenizer->process_data (shift); | ||||
47 | 14 | 55 | return _replace ($tokens, @_); | ||||
48 | } | ||||||
49 | |||||||
50 | |||||||
51 | ## | ||||||
52 | # $class->process_file ($file, @expressions); | ||||||
53 | # ------------------------------------------- | ||||||
54 | # Same as $class->process_data ($data, @expressions), except that $data is read | ||||||
55 | # from $file. | ||||||
56 | ## | ||||||
57 | sub process_file | ||||||
58 | { | ||||||
59 | 0 | 0 | 1 | 0 | my $class = shift; | ||
60 | 0 | 0 | my $tokens = MKDoc::XML::Tokenizer->process_file (shift); | ||||
61 | 0 | 0 | return _replace ($tokens, @_); | ||||
62 | } | ||||||
63 | |||||||
64 | |||||||
65 | ## | ||||||
66 | # _replace ($tokens, @expressions); | ||||||
67 | # --------------------------------- | ||||||
68 | # This function constructs the newly marked up text from a list | ||||||
69 | # of XML $tokens and a list of @expressions and returns it. | ||||||
70 | # | ||||||
71 | # Longest expressions are applied first. | ||||||
72 | ## | ||||||
73 | sub _replace | ||||||
74 | { | ||||||
75 | 14 | 14 | 26 | my $tokens = shift; | |||
76 | 14 | 56 | my @expr = sort { length ($b->{_expr}) <=> length ($a->{_expr}) } @_; | ||||
2 | 12 | ||||||
77 | |||||||
78 | 16 | 33 | @expr = map { | ||||
79 | 14 | 31 | my $hash = \%{$_}; | ||||
16 | 21 | ||||||
80 | 16 | 38 | for (keys %{$hash}) { | ||||
16 | 69 | ||||||
81 | 51 | 185 | $hash->{$_} =~ s/\&/\&/g; | ||||
82 | 51 | 163 | $hash->{$_} =~ s/\\</g; | ||||
83 | 51 | 103 | $hash->{$_} =~ s/\>/\>/g; | ||||
84 | 51 | 483 | $hash->{$_} =~ s/\"/\"/g; | ||||
85 | }; | ||||||
86 | 16 | 60 | $hash; | ||||
87 | } @expr; | ||||||
88 | |||||||
89 | 14 | 24 | my $text; local $tags; | ||||
14 | 19 | ||||||
90 | 14 | 52 | ($text, $tags) = _segregate_markup_from_text ($tokens); | ||||
91 | |||||||
92 | # once we have segregated markup from the text, we can safely | ||||||
93 | # encode < and > and "... | ||||||
94 | # $text =~ s/\&/\&/g; # seems to be already encoded... where do we encode this stuff !?! | ||||||
95 | 14 | 36 | $text =~ s/\\</g; | ||||
96 | 14 | 26 | $text =~ s/\>/\>/g; | ||||
97 | 14 | 43 | $text =~ s/\"/\"/g; | ||||
98 | |||||||
99 | # but we don't want any ' | ||||||
100 | 14 | 35 | $text =~ s/\'/\'/g; | ||||
101 | |||||||
102 | # @expr = _filter_out ($text, @expr); | ||||||
103 | 14 | 51 | while (my $attr = shift (@expr)) | ||||
104 | { | ||||||
105 | 16 | 24 | my %attr = %{$attr}; | ||||
16 | 84 | ||||||
106 | 16 | 50 | 70 | my $tag = delete $attr{_tag} || next; | |||
107 | 16 | 50 | 58 | my $expr = delete $attr{_expr} || next; | |||
108 | 16 | 92 | $text = _text_replace ($text, $expr, $tag, \%attr); | ||||
109 | } | ||||||
110 | |||||||
111 | 14 | 361 | while ($text =~ /\&\(\d+\)/) | ||||
112 | { | ||||||
113 | 15 | 37 | for (my $i = 0; $i < @{$tags}; $i++) | ||||
57 | 193 | ||||||
114 | { | ||||||
115 | 42 | 137 | my $c = $i + 1; | ||||
116 | 42 | 75 | my $tag = $tags->[$i]; | ||||
117 | 42 | 580 | $text =~ s/\&\($c\)/$tag/g; | ||||
118 | } | ||||||
119 | } | ||||||
120 | |||||||
121 | 14 | 140 | return $text; | ||||
122 | } | ||||||
123 | |||||||
124 | |||||||
125 | ## | ||||||
126 | # _text_replace ($text, $expr, $tag, $attr); | ||||||
127 | # ------------------------------------------ | ||||||
128 | # Replaces all $text, $expr, $tag, $attr. | ||||||
129 | ## | ||||||
130 | sub _text_replace | ||||||
131 | { | ||||||
132 | 16 | 16 | 31 | my $text = shift; | |||
133 | 16 | 26 | my $expr = shift; | ||||
134 | 16 | 36 | my $tag = shift; | ||||
135 | 16 | 32 | my $attr = shift; | ||||
136 | |||||||
137 | 16 | 39 | my $re = _expression_to_regex ($expr); | ||||
138 | 16 | 46 | my $tag1 = _tag_open ($tag, $attr); | ||||
139 | 16 | 55 | my $tag2 = _tag_close ($tag, $attr); | ||||
140 | |||||||
141 | # let's treat beginning and end of string as spaces, | ||||||
142 | # it makes the regular expressions much easier. | ||||||
143 | 16 | 63 | $text = " $text "; | ||||
144 | |||||||
145 | 6 | 6 | 75 | my %expr = map { $_ => 1 } $text =~ | |||
6 | 11 | ||||||
6 | 278 | ||||||
16 | 614 | ||||||
14 | 215385 | ||||||
146 | /(?<=\p{IsSpace}|\p{IsPunct}|\&)($re)(?=\p{IsSpace}|\p{IsPunct}|\&)/gi; | ||||||
147 | |||||||
148 | 16 | 26308 | foreach (keys %expr) | ||||
149 | { | ||||||
150 | 13 | 35 | my $to_replace = quotemeta ($_); | ||||
151 | 13 | 27 | my $replacement = $_; | ||||
152 | 13 | 39 | $replacement =~ s/(\&\(\d+\))/$tag2$1$tag1/g; | ||||
153 | 13 | 40 | $replacement = "$tag1$replacement$tag2"; | ||||
154 | |||||||
155 | # Double hyperlinking fix | ||||||
156 | # JM - 2004-01-23 | ||||||
157 | 13 | 125 | push @{$tags}, $replacement; | ||||
13 | 38 | ||||||
158 | 13 | 27 | my $rep = '&(' . @{$tags} . ')'; | ||||
13 | 159 | ||||||
159 | 13 | 295 | $text =~ s/(?<=\p{IsSpace}|\p{IsPunct}|\&)$to_replace(?=\p{IsSpace}|\p{IsPunct}|\&)/$rep/g; | ||||
160 | # matching placeholders fix Bruno 2005-03-10 | ||||||
161 | 13 | 4738 | my $rep_quoted = quotemeta ($rep); | ||||
162 | 13 | 144 | $text =~ s/&\($rep_quoted\)/&($to_replace)/g; | ||||
163 | } | ||||||
164 | |||||||
165 | # remove the first and last space which we previously inserted for | ||||||
166 | # ease-of-regex purposes. | ||||||
167 | 16 | 78 | $text =~ s/^ //; | ||||
168 | 16 | 162 | $text =~ s/ $//; | ||||
169 | 16 | 198 | return $text; | ||||
170 | } | ||||||
171 | |||||||
172 | |||||||
173 | ## | ||||||
174 | # _segregate_markup_from_text ($tokens); | ||||||
175 | # -------------------------------------- | ||||||
176 | # From an array reference of tokens, returns text with | ||||||
177 | # placeholders for markup, followed by an array reference | ||||||
178 | # of markup tokens. | ||||||
179 | # | ||||||
180 | # Example: | ||||||
181 | # | ||||||
182 | # [ '', 'Hello ', ' ', 'World', '' ] |
||||||
183 | # | ||||||
184 | # becomes | ||||||
185 | # | ||||||
186 | # ( '&(1)Hello &(2)World&(3)', [ '', ' ', '' ] ) |
||||||
187 | ## | ||||||
188 | sub _segregate_markup_from_text | ||||||
189 | { | ||||||
190 | 15 | 15 | 34 | my $tokens = shift; | |||
191 | 15 | 26 | my @tags = (); | ||||
192 | 15 | 38 | my $res = ''; | ||||
193 | |||||||
194 | 15 | 95 | for (@{$tokens}) | ||||
15 | 36 | ||||||
195 | { | ||||||
196 | 50 | 108 | $_ = $$_; # replace the token object by its value | ||||
197 | 50 | 100 | 364 | /^ and do { | |||
198 | 28 | 48 | push @tags, $_; | ||||
199 | 28 | 152 | $res .= '&(' . @tags . ')'; | ||||
200 | 28 | 57 | next; | ||||
201 | }; | ||||||
202 | 22 | 55 | $res .= $_; | ||||
203 | } | ||||||
204 | |||||||
205 | 15 | 62 | return $res, \@tags; | ||||
206 | } | ||||||
207 | |||||||
208 | |||||||
209 | ## | ||||||
210 | # _expression_to_regex ($expr); | ||||||
211 | # ----------------------------- | ||||||
212 | # Turns $expr into a regular expression that will match | ||||||
213 | # all segregated text which should match this expression. | ||||||
214 | ## | ||||||
215 | sub _expression_to_regex | ||||||
216 | { | ||||||
217 | 16 | 16 | 27 | my $text = shift; | |||
218 | 16 | 35 | $text = lc ($text); | ||||
219 | 16 | 58 | $text =~ s/^(?:\s|\r|\n)+//; | ||||
220 | 16 | 76 | $text =~ s/(?:\s|\r|\n)+$//; | ||||
221 | |||||||
222 | 16 | 102 | my @split = split /(?:\s|\r|\n)+/, $text; | ||||
223 | 16 | 36 | $text = join $Ignorable_RE, map { quotemeta ($_) } @split; | ||||
23 | 84 | ||||||
224 | |||||||
225 | 16 | 49 | return $text; | ||||
226 | } | ||||||
227 | |||||||
228 | |||||||
229 | ## | ||||||
230 | # _tag_open ($tag_name, $tag_attributes); | ||||||
231 | # --------------------------------------- | ||||||
232 | # Turns a structure representing an opening tag into | ||||||
233 | # a string representing an opening tag. | ||||||
234 | ## | ||||||
235 | sub _tag_open | ||||||
236 | { | ||||||
237 | 18 | 18 | 2601 | my $tag = shift; | |||
238 | 18 | 30 | my $attr = shift; | ||||
239 | |||||||
240 | 20 | 43 | my $attr_str = join ' ', map { $_ . '=' . do { | ||||
18 | 58 | ||||||
241 | 20 | 39 | my $val = $attr->{$_}; | ||||
242 | 20 | 82 | "\"$val\""; | ||||
243 | 18 | 28 | } } keys %{$attr}; | ||||
244 | |||||||
245 | 18 | 100 | 412 | return $attr_str ? "<$tag $attr_str>" : "<$tag>"; | |||
246 | } | ||||||
247 | |||||||
248 | |||||||
249 | ## | ||||||
250 | # _tag_close ($tag_name); | ||||||
251 | # ----------------------- | ||||||
252 | # Turns a structure representing an closing tag into | ||||||
253 | # a string representing a closing tag. | ||||||
254 | ## | ||||||
255 | sub _tag_close | ||||||
256 | { | ||||||
257 | 17 | 17 | 41 | my $tag = shift; | |||
258 | 17 | 57 | return "$tag>"; | ||||
259 | } | ||||||
260 | |||||||
261 | |||||||
262 | 1; | ||||||
263 | |||||||
264 | |||||||
265 | __END__ |