File Coverage

blib/lib/Biblio/bp/lib/bp-p-cs.pl

Criterion	Covered	Total	%
statement	10	136	7.3
branch	7	62	11.2
condition	0	9	0.0
subroutine	2	11	18.1
pod			n/a
total	19	218	8.7

line	stmt	bran	cond	sub	time	code
1						#
2						# bibliography package for Perl
3						#
4						# Character set common variables and routines
5						#
6						# Dana Jacobsen (dana@acm.org)
7						# 18 November 1995 (last modified 17 March 1996)
8
9						# for bib'nocharset which calls fromcanon:
10						require "bp-cs-none.pl";
11
12						######
13						#
14						# Return canonical character for a unicode hex string.
15						#
16						sub unicode_to_canon {
17	262			262	447	local($hex) = @_;
18
19	262				440	$hex =~ tr/a-f/A-F/;
20
21						# XXXXX Should we prepend '0' characters if we don't have 4 digits?
22	262	50			800	if ($hex !~ /^[\dA-F]{4}$/) {
23	0				0	&bib'gotwarn("Invalid Unicode character: $hex");
24	0				0	return '';
25						}
26	262	100			734	if ($hex =~ /00(..)/) {
27	188				718	return pack("C", hex($1));
28						}
29	74				208	return $bib'cs_ext . $hex;
30						}
31
32						sub canon_to_unicode {
33	0			0	0	local($can) = @_;
34	0				0	local($hex);
35
36	0	0			0	if (length($can) == 1) {
37	0				0	$hex = sprintf("%2lx", ord($can));
38	0				0	$hex =~ tr/a-f /A-F0/;
39	0				0	return( '00' . $hex );
40						}
41	0	0			0	if ($can =~ /$bib'cs_ext(....)/) {
42	0				0	$hex = $1;
43	0				0	$hex =~ tr/a-f/A-F/;
44	0				0	return $hex;
45						}
46	0	0			0	if ($can eq $bib'cs_char_escape) {
47	0				0	return &bib'canon_to_unicode($bib'cs_escape);
48						}
49	0				0	return &bib'gotwarn("Can't convert $can to Unicode");
50						}
51
52						sub decimal_to_unicode {
53	0			0	0	local($num) = @_;
54	0				0	local($hex);
55
56	0	0			0	if ($num < 256) {
		0
57	0				0	$hex = sprintf("00%2lx", $num);
58						} elsif ($num < 65536) {
59	0				0	local($div) = $num / 256;
60	0				0	local($high) = int($div);
61	0				0	local($low) = 256 * ($div - $high);
62	0				0	$hex = sprintf("%2lx%2lx", $high, $low);
63						} else {
64	0				0	return &bib'gotwarn("Illegal number $num given to decimal_to_unicode");
65						}
66	0				0	$hex =~ tr/a-f /A-F0/;
67	0				0	$hex;
68						}
69
70						sub unicode_to_decimal {
71	0			0	0	local($uni) = @_;
72
73	0	0			0	return &bib'gotwarn("Illegal unicode length: $uni") unless length($uni) == 4;
74	0	0			0	return &bib'gotwarn("Illegal unicode string: $uni") if $uni =~ /[^\da-fA-F]/;
75
76	0				0	hex($uni);
77						}
78
79						sub unicode_name {
80	0			0	0	local($hex) = @_;
81	0				0	local($name);
82
83						# For now, just print hex value
84	0				0	$name = "Unicode '$hex'";
85	0				0	$name;
86						}
87
88						sub meta_name {
89	0			0	0	local($hex) = @_;
90	0				0	local($name);
91
92						# For now, just print hex value
93	0				0	$name = "Meta '$hex'";
94	0				0	$name;
95						}
96
97						# Oh boy, this is getting really complicated.
98						#
99						# We have an approx table set up, which says that one can approximate XXXX
100						# by YYYY, where presumably YYYY is easier. There shouldn't be any loops,
101						# so programs can recurse through the table.
102						#
103						# That's for the meta codes. For the unicode approx, we just have a
104						# string. This allows multiple character approximations.
105						#
106						# XXXXX Think about C3's idea of multiple approximations.
107						#
108						# A map of 0000 means that it maps to the null string -- our "approximation"
109						# is to get rid of it. This is what we can do if it isn't terribly harmful
110						# to remove it.
111
112						sub meta_approx {
113	12			12	29	local($orig) = @_;
114
115	12	100			2384	require "${glb_bpprefix}p-cstab.pl" unless defined %bib'mapprox_tab;
116
117	12	50			47	if (defined $mapprox_tab{$orig}) {
118	12	50			55	return '' if $mapprox_tab{$orig} eq '0000';
119	0					return "${bib'cs_meta}$mapprox_tab{$orig}";
120						}
121	0					undef;
122						}
123
124						sub unicode_approx {
125	0			0		local($orig) = @_;
126
127						# XXXXX Should we warn them that they're getting an approx?
128
129	0	0				require "${glb_bpprefix}p-cstab.pl" unless defined %bib'uapprox_tab;
130
131	0					$uapprox_tab{$orig};
132						}
133
134						######
135						#
136						# Font change checker. Verifies and tries to correct font changes.
137						#
138						# After fonts are converted in your tocanon routine, call this. In your
139						# fromcanon routine, you can assume this has been called.
140						#
141						# XXXXX Should we call this in bp.pl's conversion routines? That would
142						# guarantee that it has been run. Right now, we let each module
143						# decide when and if it needs to be run.
144						#
145						# It takes a string that has font changes in it and makes sure they always
146						# match up and that there isn't an odd number (more starts than ends, more
147						# ends than starts).
148						#
149
150						sub font_check {
151	0			0		local($_) = @_;
152
153						# XXXXX Ought to read meta information from 00 or as input.
154	0	0				return $_ unless /${bib'cs_meta}01[01]/;
155
156	0					local(@fontstack) = ();
157	0					local($fontsmatch, $font, $pfont);
158
159						# Check for this special occurance: They don't have end fonts (or don't
160						# use them). They just make everything a begin font (troff often does this).
161						# Solution: Try to fix it up by replacing each begin after the first with
162						# an endprevious / begin pair. Then remove the last begin.
163	0	0				if (!/${bib'cs_meta}011/) {
164	0					local($pos) = $[;
165	0					local($lpos) = 0;
166	0					local($distance) = length($bib'cs_meta) + 3;
167	0					local($n) = 0;
168	0					while (($pos = index($_, "${bib'cs_meta}010", $pos)) >= $[) {
169	0					$n++;
170	0	0				if ($n == 1) {
171	0					$lpos = $pos;
172	0					$pfont = substr($_, $pos + $distance, 1);
173	0					$pos++;
174	0					next;
175						}
176	0					$lpos = $pos;
177	0					$font = substr($_, $pos + $distance, 1);
178	0					substr($_, $pos, 0) = "${bib'cs_meta}0110";
179	0					$pos += ($distance*2); # need to skip over the one we just put in.
180						}
181	0	0				if ($n > 1) {
		0
182						# now remove the last begin
183	0					substr($_, $lpos + $distance + 1, $distance + 1) = '';
184						} elsif ($n == 1) {
185						# only one begin? Add a previous to the end.
186	0					$_ .= "${bib'cs_meta}0110";
187						} else {
188	0					&panic("Bug in font_check, file bp-p-cs.");
189						}
190						#print STDERR "F: end of troff: $_\n";
191						# XXXXX XXF return $_;
192						}
193
194	0					do {
195						# We assume that everything is ok until something goes wrong.
196	0					$fontsmatch = 1;
197	0					while (/${bib'cs_meta}01(.)(.)/g) {
198	0					$font = $2;
199	0	0				if ($1 eq '0') { # font begin
200						#print STDERR "F: check begin font $font\n";
201	0	0				if ($font eq '0') {
202	0					&bib'gotwarn("Someone used default font begin. Naughty.");
203	0					s/${bib'cs_meta}0100/${bib'cs_meta}0110/go;
204	0					$fontsmatch = 0;
205	0					last;
206						}
207	0					push(@fontstack, $font);
208						} else { # font end
209						#print STDERR "F: check end font $font\n";
210	0	0				if (@fontstack) {
211	0					$pfont = pop(@fontstack);
212	0	0				next if $font eq '0'; # previous font. We don't care what it was.
213	0	0				if ($pfont ne $font) {
214						# _____ ended font that wasn't equal to the last begin
215	0					&bib'gotwarn("Nesting problem. Ended $font after $pfont");
216						# just make it end the previous one.
217	0	0				if ($] >= 5.000) {
218	0					s/(${bib'cs_meta}010$pfont)(.*?)${bib'cs_meta}011$font/$1$2{bib'cs_meta}011$pfont/;
219						} else {
220	0					s/(${bib'cs_meta}010$pfont)(.*)${bib'cs_meta}011$font/$1$2{bib'cs_meta}011$pfont/;
221						}
222	0					$fontsmatch = 0;
223	0					last;
224						}
225						} else {
226						# _____ end font used without a begin
227	0					&bib'gotwarn("Ended font $font before begin seen");
228						# This is really lousy, but without pulling the whole string apart,
229						# I can't do it properly.
230						# XXXXX Perhaps we should be pulling it apart with split?
231	0					s/${bib'cs_meta}011$font//;
232	0					$fontsmatch = 0;
233	0					last;
234						}
235						}
236						# .last statement of while
237						}
238	0	0	0			if ( $fontsmatch && (@fontstack != 0) ) {
239						# _____ too many begins
240
241						# XXXXX Is this loop needed, since we did this above?
242						# Try the simple case: They think roman is the default font, and begin
243						# it instead of ending their own. We'll treat this as a misunderstanding
244						# and won't create a warning.
245	0		0			while ( (@fontstack != 0)
			0
246						&& (@fontstack % 2 == 0)
247						&& ($fontstack[$#fontstack] eq '1') ) {
248	0					$pfont = pop(@fontstack);
249	0					$pfont = pop(@fontstack);
250						#print STDERR "F: Too many begins found roman & replacing with $pfont\n";
251	0	0				if ($] >= 5.000) {
252	0					s/(${bib'cs_meta}010$pfont)(.*?)${bib'cs_meta}0111/$1$2{bib'cs_meta}011$pfont/;
253						} else {
254	0					s/(${bib'cs_meta}010$pfont)(.*)${bib'cs_meta}0111/$1$2{bib'cs_meta}011$pfont/;
255						}
256						}
257	0					while (@fontstack != 0) {
258	0					$pfont = pop(@fontstack);
259	0					&bib'gotwarn("Began font $pfont, but never ended it!");
260						# just end them in order.
261						# XXXXX Should we use previous? Doesn't seem to matter.
262	0					$_ .= "${bib'cs_meta}011$pfont";
263						}
264						# Yes, $fontsmatch should be 0 at the end. We want one more pass through
265						# to validate the ordering. Our search and replace routines very well
266						# may find the wrong item!
267	0					$fontsmatch = 0;
268						# .last statement of if too many begins
269						}
270						} until $fontsmatch;
271
272	0					$_;
273						}
274
275						######
276						#
277						# This routine removes references to "font-previous" for those routines
278						# that don't support that notation. For instance, while troff has the
279						# \fP command, and TeX uses {\it foo}, HTML doesn't understand this. It
280						# needs a matching for every . So this routine will turn all
281						# references to the previous font into a "font-xxx-off" command.
282						#
283						# We assume that the string has been run through font_check already.
284						# Since this is called in a cs's fromcanon routine, that should be true.
285						#
286						sub font_noprev {
287	0			0		local($val) = @_;
288	0					local(@sval, @fontstack);
289	0					local($font, $pfont);
290	0					local($ret) = '';
291
292	0	0				return $val unless $val =~ /${bib'cs_meta}01/;
293
294	0					@sval = split(/${bib'cs_meta}01([01].)/, $val);
295	0					while (@sval) {
296	0					$ret .= shift @sval;
297	0	0				if (@sval) {
298	0					$font = shift @sval;
299	0	0				if ($font =~ /^0/) {
300						# font begin
301	0					push(@fontstack, $font);
302						} else {
303	0					$pfont = pop @fontstack;
304	0	0				if ($font =~ /^10/) {
305	0					$font = $pfont;
306	0					$font =~ s/^0/1/;
307						} else {
308	0	0				if (substr($font, 1, 1) ne substr($pfont, 1, 1)) {
309	0					&bib'gotwarn("Nesting problem in noprev. Ended $font after $pfont");
310						}
311						}
312						}
313	0					$ret .= "${bib'cs_meta}01$font";
314						}
315						}
316	0					$ret;
317						}
318
319
320						######
321						#
322						# This should strip off any special characters and replace them with either
323						# a simple form, or delete them. This will return ASCII text only, which
324						# is also equivilant to the 7 bit subset of ISO-8859-1.
325						#
326						# XXXXX I've tentatively decided that the actual code for this belongs in
327						# cs-none. The problem is that we then must load that file before
328						# running anything. It's a small file though.
329						# It should be required at the top of this file, so it's always loaded.
330						#
331						sub nocharset {
332	0			0		&bp_cs_none'fromcanon(@_);
333						}
334
335						1;