File Coverage

blib/lib/Biblio/bp/lib/bp-p-cs.pl
Criterion Covered Total %
statement 10 136 7.3
branch 7 62 11.2
condition 0 9 0.0
subroutine 2 11 18.1
pod n/a
total 19 218 8.7


line stmt bran cond sub pod time code
1             #
2             # bibliography package for Perl
3             #
4             # Character set common variables and routines
5             #
6             # Dana Jacobsen (dana@acm.org)
7             # 18 November 1995 (last modified 17 March 1996)
8              
9             # for bib'nocharset which calls fromcanon:
10             require "bp-cs-none.pl";
11              
12             ######
13             #
14             # Return canonical character for a unicode hex string.
15             #
16             sub unicode_to_canon {
17 262     262   447 local($hex) = @_;
18              
19 262         440 $hex =~ tr/a-f/A-F/;
20              
21             # XXXXX Should we prepend '0' characters if we don't have 4 digits?
22 262 50       800 if ($hex !~ /^[\dA-F]{4}$/) {
23 0         0 &bib'gotwarn("Invalid Unicode character: $hex");
24 0         0 return '';
25             }
26 262 100       734 if ($hex =~ /00(..)/) {
27 188         718 return pack("C", hex($1));
28             }
29 74         208 return $bib'cs_ext . $hex;
30             }
31              
32             sub canon_to_unicode {
33 0     0   0 local($can) = @_;
34 0         0 local($hex);
35              
36 0 0       0 if (length($can) == 1) {
37 0         0 $hex = sprintf("%2lx", ord($can));
38 0         0 $hex =~ tr/a-f /A-F0/;
39 0         0 return( '00' . $hex );
40             }
41 0 0       0 if ($can =~ /$bib'cs_ext(....)/) {
42 0         0 $hex = $1;
43 0         0 $hex =~ tr/a-f/A-F/;
44 0         0 return $hex;
45             }
46 0 0       0 if ($can eq $bib'cs_char_escape) {
47 0         0 return &bib'canon_to_unicode($bib'cs_escape);
48             }
49 0         0 return &bib'gotwarn("Can't convert $can to Unicode");
50             }
51              
52             sub decimal_to_unicode {
53 0     0   0 local($num) = @_;
54 0         0 local($hex);
55              
56 0 0       0 if ($num < 256) {
    0          
57 0         0 $hex = sprintf("00%2lx", $num);
58             } elsif ($num < 65536) {
59 0         0 local($div) = $num / 256;
60 0         0 local($high) = int($div);
61 0         0 local($low) = 256 * ($div - $high);
62 0         0 $hex = sprintf("%2lx%2lx", $high, $low);
63             } else {
64 0         0 return &bib'gotwarn("Illegal number $num given to decimal_to_unicode");
65             }
66 0         0 $hex =~ tr/a-f /A-F0/;
67 0         0 $hex;
68             }
69              
70             sub unicode_to_decimal {
71 0     0   0 local($uni) = @_;
72              
73 0 0       0 return &bib'gotwarn("Illegal unicode length: $uni") unless length($uni) == 4;
74 0 0       0 return &bib'gotwarn("Illegal unicode string: $uni") if $uni =~ /[^\da-fA-F]/;
75              
76 0         0 hex($uni);
77             }
78              
79             sub unicode_name {
80 0     0   0 local($hex) = @_;
81 0         0 local($name);
82              
83             # For now, just print hex value
84 0         0 $name = "Unicode '$hex'";
85 0         0 $name;
86             }
87              
88             sub meta_name {
89 0     0   0 local($hex) = @_;
90 0         0 local($name);
91              
92             # For now, just print hex value
93 0         0 $name = "Meta '$hex'";
94 0         0 $name;
95             }
96              
97             # Oh boy, this is getting really complicated.
98             #
99             # We have an approx table set up, which says that one can approximate XXXX
100             # by YYYY, where presumably YYYY is easier. There shouldn't be any loops,
101             # so programs can recurse through the table.
102             #
103             # That's for the meta codes. For the unicode approx, we just have a
104             # string. This allows multiple character approximations.
105             #
106             # XXXXX Think about C3's idea of multiple approximations.
107             #
108             # A map of 0000 means that it maps to the null string -- our "approximation"
109             # is to get rid of it. This is what we can do if it isn't terribly harmful
110             # to remove it.
111              
112             sub meta_approx {
113 12     12   29 local($orig) = @_;
114              
115 12 100       2384 require "${glb_bpprefix}p-cstab.pl" unless defined %bib'mapprox_tab;
116              
117 12 50       47 if (defined $mapprox_tab{$orig}) {
118 12 50       55 return '' if $mapprox_tab{$orig} eq '0000';
119 0           return "${bib'cs_meta}$mapprox_tab{$orig}";
120             }
121 0           undef;
122             }
123              
124             sub unicode_approx {
125 0     0     local($orig) = @_;
126              
127             # XXXXX Should we warn them that they're getting an approx?
128              
129 0 0         require "${glb_bpprefix}p-cstab.pl" unless defined %bib'uapprox_tab;
130              
131 0           $uapprox_tab{$orig};
132             }
133              
134             ######
135             #
136             # Font change checker. Verifies and tries to correct font changes.
137             #
138             # After fonts are converted in your tocanon routine, call this. In your
139             # fromcanon routine, you can assume this has been called.
140             #
141             # XXXXX Should we call this in bp.pl's conversion routines? That would
142             # guarantee that it has been run. Right now, we let each module
143             # decide when and if it needs to be run.
144             #
145             # It takes a string that has font changes in it and makes sure they always
146             # match up and that there isn't an odd number (more starts than ends, more
147             # ends than starts).
148             #
149              
150             sub font_check {
151 0     0     local($_) = @_;
152              
153             # XXXXX Ought to read meta information from 00 or as input.
154 0 0         return $_ unless /${bib'cs_meta}01[01]/;
155              
156 0           local(@fontstack) = ();
157 0           local($fontsmatch, $font, $pfont);
158              
159             # Check for this special occurance: They don't have end fonts (or don't
160             # use them). They just make everything a begin font (troff often does this).
161             # Solution: Try to fix it up by replacing each begin after the first with
162             # an endprevious / begin pair. Then remove the last begin.
163 0 0         if (!/${bib'cs_meta}011/) {
164 0           local($pos) = $[;
165 0           local($lpos) = 0;
166 0           local($distance) = length($bib'cs_meta) + 3;
167 0           local($n) = 0;
168 0           while (($pos = index($_, "${bib'cs_meta}010", $pos)) >= $[) {
169 0           $n++;
170 0 0         if ($n == 1) {
171 0           $lpos = $pos;
172 0           $pfont = substr($_, $pos + $distance, 1);
173 0           $pos++;
174 0           next;
175             }
176 0           $lpos = $pos;
177 0           $font = substr($_, $pos + $distance, 1);
178 0           substr($_, $pos, 0) = "${bib'cs_meta}0110";
179 0           $pos += ($distance*2); # need to skip over the one we just put in.
180             }
181 0 0         if ($n > 1) {
    0          
182             # now remove the last begin
183 0           substr($_, $lpos + $distance + 1, $distance + 1) = '';
184             } elsif ($n == 1) {
185             # only one begin? Add a previous to the end.
186 0           $_ .= "${bib'cs_meta}0110";
187             } else {
188 0           &panic("Bug in font_check, file bp-p-cs.");
189             }
190             #print STDERR "F: end of troff: $_\n";
191             # XXXXX XXF return $_;
192             }
193            
194 0           do {
195             # We assume that everything is ok until something goes wrong.
196 0           $fontsmatch = 1;
197 0           while (/${bib'cs_meta}01(.)(.)/g) {
198 0           $font = $2;
199 0 0         if ($1 eq '0') { # font begin
200             #print STDERR "F: check begin font $font\n";
201 0 0         if ($font eq '0') {
202 0           &bib'gotwarn("Someone used default font begin. Naughty.");
203 0           s/${bib'cs_meta}0100/${bib'cs_meta}0110/go;
204 0           $fontsmatch = 0;
205 0           last;
206             }
207 0           push(@fontstack, $font);
208             } else { # font end
209             #print STDERR "F: check end font $font\n";
210 0 0         if (@fontstack) {
211 0           $pfont = pop(@fontstack);
212 0 0         next if $font eq '0'; # previous font. We don't care what it was.
213 0 0         if ($pfont ne $font) {
214             # _____ ended font that wasn't equal to the last begin
215 0           &bib'gotwarn("Nesting problem. Ended $font after $pfont");
216             # just make it end the previous one.
217 0 0         if ($] >= 5.000) {
218 0           s/(${bib'cs_meta}010$pfont)(.*?)${bib'cs_meta}011$font/$1$2{bib'cs_meta}011$pfont/;
219             } else {
220 0           s/(${bib'cs_meta}010$pfont)(.*)${bib'cs_meta}011$font/$1$2{bib'cs_meta}011$pfont/;
221             }
222 0           $fontsmatch = 0;
223 0           last;
224             }
225             } else {
226             # _____ end font used without a begin
227 0           &bib'gotwarn("Ended font $font before begin seen");
228             # This is really lousy, but without pulling the whole string apart,
229             # I can't do it properly.
230             # XXXXX Perhaps we should be pulling it apart with split?
231 0           s/${bib'cs_meta}011$font//;
232 0           $fontsmatch = 0;
233 0           last;
234             }
235             }
236             # .last statement of while
237             }
238 0 0 0       if ( $fontsmatch && (@fontstack != 0) ) {
239             # _____ too many begins
240              
241             # XXXXX Is this loop needed, since we did this above?
242             # Try the simple case: They think roman is the default font, and begin
243             # it instead of ending their own. We'll treat this as a misunderstanding
244             # and won't create a warning.
245 0   0       while ( (@fontstack != 0)
      0        
246             && (@fontstack % 2 == 0)
247             && ($fontstack[$#fontstack] eq '1') ) {
248 0           $pfont = pop(@fontstack);
249 0           $pfont = pop(@fontstack);
250             #print STDERR "F: Too many begins found roman & replacing with $pfont\n";
251 0 0         if ($] >= 5.000) {
252 0           s/(${bib'cs_meta}010$pfont)(.*?)${bib'cs_meta}0111/$1$2{bib'cs_meta}011$pfont/;
253             } else {
254 0           s/(${bib'cs_meta}010$pfont)(.*)${bib'cs_meta}0111/$1$2{bib'cs_meta}011$pfont/;
255             }
256             }
257 0           while (@fontstack != 0) {
258 0           $pfont = pop(@fontstack);
259 0           &bib'gotwarn("Began font $pfont, but never ended it!");
260             # just end them in order.
261             # XXXXX Should we use previous? Doesn't seem to matter.
262 0           $_ .= "${bib'cs_meta}011$pfont";
263             }
264             # Yes, $fontsmatch should be 0 at the end. We want one more pass through
265             # to validate the ordering. Our search and replace routines very well
266             # may find the wrong item!
267 0           $fontsmatch = 0;
268             # .last statement of if too many begins
269             }
270             } until $fontsmatch;
271              
272 0           $_;
273             }
274              
275             ######
276             #
277             # This routine removes references to "font-previous" for those routines
278             # that don't support that notation. For instance, while troff has the
279             # \fP command, and TeX uses {\it foo}, HTML doesn't understand this. It
280             # needs a matching for every . So this routine will turn all
281             # references to the previous font into a "font-xxx-off" command.
282             #
283             # We assume that the string has been run through font_check already.
284             # Since this is called in a cs's fromcanon routine, that should be true.
285             #
286             sub font_noprev {
287 0     0     local($val) = @_;
288 0           local(@sval, @fontstack);
289 0           local($font, $pfont);
290 0           local($ret) = '';
291              
292 0 0         return $val unless $val =~ /${bib'cs_meta}01/;
293              
294 0           @sval = split(/${bib'cs_meta}01([01].)/, $val);
295 0           while (@sval) {
296 0           $ret .= shift @sval;
297 0 0         if (@sval) {
298 0           $font = shift @sval;
299 0 0         if ($font =~ /^0/) {
300             # font begin
301 0           push(@fontstack, $font);
302             } else {
303 0           $pfont = pop @fontstack;
304 0 0         if ($font =~ /^10/) {
305 0           $font = $pfont;
306 0           $font =~ s/^0/1/;
307             } else {
308 0 0         if (substr($font, 1, 1) ne substr($pfont, 1, 1)) {
309 0           &bib'gotwarn("Nesting problem in noprev. Ended $font after $pfont");
310             }
311             }
312             }
313 0           $ret .= "${bib'cs_meta}01$font";
314             }
315             }
316 0           $ret;
317             }
318              
319              
320             ######
321             #
322             # This should strip off any special characters and replace them with either
323             # a simple form, or delete them. This will return ASCII text only, which
324             # is also equivilant to the 7 bit subset of ISO-8859-1.
325             #
326             # XXXXX I've tentatively decided that the actual code for this belongs in
327             # cs-none. The problem is that we then must load that file before
328             # running anything. It's a small file though.
329             # It should be required at the top of this file, so it's always loaded.
330             #
331             sub nocharset {
332 0     0     &bp_cs_none'fromcanon(@_);
333             }
334              
335             1;