line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
/* vim: set sw=8 ts=8 si noet: */ |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
/* written by Guido Socher. |
4
|
|
|
|
|
|
|
* |
5
|
|
|
|
|
|
|
* This program is free software; you can redistribute it |
6
|
|
|
|
|
|
|
* and/or modify it under the same terms as Perl itself. |
7
|
|
|
|
|
|
|
*/ |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
/* read the following man pages to learn how to use XS and access perl from C: |
10
|
|
|
|
|
|
|
* perlxs Perl XS application programming interface |
11
|
|
|
|
|
|
|
* perlxstut Perl XS tutorial |
12
|
|
|
|
|
|
|
* perlguts Perl internal functions, variables, data structures for |
13
|
|
|
|
|
|
|
* C programmer |
14
|
|
|
|
|
|
|
* perlcall Perl calling conventions from C |
15
|
|
|
|
|
|
|
* perlapio IO abstraction interface |
16
|
|
|
|
|
|
|
* perlapi Perl C api |
17
|
|
|
|
|
|
|
*/ |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
// we use perlio not stdio: |
20
|
|
|
|
|
|
|
#define PERLIO_NOT_STDIO 0 |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
#ifdef __cplusplus |
23
|
|
|
|
|
|
|
extern "C" { |
24
|
|
|
|
|
|
|
#endif |
25
|
|
|
|
|
|
|
#include "EXTERN.h" |
26
|
|
|
|
|
|
|
#include "perl.h" |
27
|
|
|
|
|
|
|
#include "XSUB.h" |
28
|
|
|
|
|
|
|
#include |
29
|
|
|
|
|
|
|
#include |
30
|
|
|
|
|
|
|
#include |
31
|
|
|
|
|
|
|
#ifdef __cplusplus |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
#endif |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
/* tags longer than TAGREADER_MAX_TAGLEN produce a warning about |
37
|
|
|
|
|
|
|
* not terminated tags, must be much smaler than BUFFLEN */ |
38
|
|
|
|
|
|
|
#define TAGREADER_MAX_TAGLEN 400 |
39
|
|
|
|
|
|
|
/* BUFFLEN is the units in which we re-allocate mem, must be much bigger than |
40
|
|
|
|
|
|
|
* TAGREADER_MAX_TAGLEN */ |
41
|
|
|
|
|
|
|
#define BUFFLEN 6000 |
42
|
|
|
|
|
|
|
#define TAGREADER_TAGTYPELEN 25 |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
typedef struct trstuct{ |
45
|
|
|
|
|
|
|
char *filename; |
46
|
|
|
|
|
|
|
int fileline; |
47
|
|
|
|
|
|
|
int tagline; /* file line where the tag starts */ |
48
|
|
|
|
|
|
|
int charpos; /* character pos in the line */ |
49
|
|
|
|
|
|
|
int tagcharpos; /* character pos where tag starts */ |
50
|
|
|
|
|
|
|
int currbuflen; |
51
|
|
|
|
|
|
|
PerlIO *fd; |
52
|
|
|
|
|
|
|
char tagtype[TAGREADER_TAGTYPELEN + 1]; |
53
|
|
|
|
|
|
|
char *buffer; |
54
|
|
|
|
|
|
|
} *HTML__TagReader; |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
/* WIN32 stuff from: DH , |
57
|
|
|
|
|
|
|
* http://testers.cpan.org/ */ |
58
|
|
|
|
|
|
|
#ifdef WIN32 |
59
|
|
|
|
|
|
|
#define THEINLINE __forceinline |
60
|
|
|
|
|
|
|
#else |
61
|
|
|
|
|
|
|
#define THEINLINE inline |
62
|
|
|
|
|
|
|
#endif |
63
|
|
|
|
|
|
|
/* start of a html tag (first char in the tag) */ |
64
|
143
|
|
|
|
|
|
static THEINLINE int is_start_of_tag(int ch){ |
65
|
143
|
100
|
|
|
|
|
if (ch=='!' || ch=='/' || ch=='?' || isalnum(ch)){ |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
66
|
122
|
|
|
|
|
|
return(1); |
67
|
|
|
|
|
|
|
} |
68
|
21
|
|
|
|
|
|
return(0); |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
MODULE = HTML::TagReader PACKAGE = HTML::TagReader PREFIX = tr_ |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
PROTOTYPES: ENABLE |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
HTML::TagReader |
76
|
|
|
|
|
|
|
tr_new(class, filename) |
77
|
|
|
|
|
|
|
SV *class |
78
|
|
|
|
|
|
|
SV *filename |
79
|
|
|
|
|
|
|
CODE: |
80
|
|
|
|
|
|
|
STRLEN i; // int |
81
|
|
|
|
|
|
|
char *str; |
82
|
3
|
50
|
|
|
|
|
if (!SvPOKp(filename)){ |
83
|
0
|
|
|
|
|
|
croak("ERROR: filename must be a string scalar"); |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
/* malloc and zero the struct */ |
86
|
3
|
|
|
|
|
|
Newz(0, RETVAL, 1, struct trstuct ); |
87
|
3
|
50
|
|
|
|
|
str=SvPV(filename,i); |
88
|
|
|
|
|
|
|
/* malloc */ |
89
|
3
|
|
|
|
|
|
New(0, RETVAL->filename, i+1, char ); |
90
|
3
|
|
|
|
|
|
strncpy(RETVAL->filename,str,i); |
91
|
|
|
|
|
|
|
/* malloc initial buffer */ |
92
|
3
|
|
|
|
|
|
New(0, RETVAL->buffer, BUFFLEN+1, char ); |
93
|
3
|
|
|
|
|
|
RETVAL->currbuflen=BUFFLEN; |
94
|
|
|
|
|
|
|
/* put a zero at the end of the string, perl might not do it */ |
95
|
3
|
|
|
|
|
|
*(RETVAL->filename + i )=(char)0; |
96
|
3
|
|
|
|
|
|
RETVAL->fd=PerlIO_open(str,"r"); |
97
|
3
|
50
|
|
|
|
|
if (RETVAL->fd == NULL){ |
98
|
0
|
|
|
|
|
|
croak("ERROR: Can not read file \"%s\" ",str); |
99
|
|
|
|
|
|
|
} |
100
|
3
|
|
|
|
|
|
RETVAL->charpos=0; |
101
|
3
|
|
|
|
|
|
RETVAL->tagcharpos=0; |
102
|
3
|
|
|
|
|
|
RETVAL->fileline=1; |
103
|
3
|
|
|
|
|
|
RETVAL->tagline=0; |
104
|
|
|
|
|
|
|
OUTPUT: |
105
|
|
|
|
|
|
|
RETVAL |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
HTML::TagReader |
108
|
|
|
|
|
|
|
tr_new_from_iofh(class, fh) |
109
|
|
|
|
|
|
|
SV *class |
110
|
|
|
|
|
|
|
PerlIO *fh |
111
|
|
|
|
|
|
|
CODE: |
112
|
|
|
|
|
|
|
STRLEN i; // int |
113
|
2
|
|
|
|
|
|
char str[]="iofh"; |
114
|
|
|
|
|
|
|
char c; |
115
|
2
|
50
|
|
|
|
|
if (fh == NULL){ |
116
|
0
|
|
|
|
|
|
croak("ERROR: invalid PerlIO fh"); |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
// let's do some test to see if we will be able to read on this io filehandle: |
119
|
2
|
|
|
|
|
|
c=PerlIO_getc(fh); |
120
|
|
|
|
|
|
|
// c is EOF in case of error or end of file |
121
|
2
|
50
|
|
|
|
|
if (c==EOF){ |
122
|
0
|
0
|
|
|
|
|
if (PerlIO_error(fh)){ |
123
|
0
|
|
|
|
|
|
croak("ERROR: can not read from IO filehandle"); |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
// no ungetc in case of EOF |
126
|
|
|
|
|
|
|
}else{ |
127
|
2
|
50
|
|
|
|
|
if (PerlIO_ungetc(fh,c)==EOF){ |
128
|
0
|
|
|
|
|
|
croak("ERROR: ungetc on filehandle failed"); |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
} |
131
|
2
|
|
|
|
|
|
i=strlen(str); |
132
|
|
|
|
|
|
|
// malloc and zero the struct |
133
|
2
|
|
|
|
|
|
Newz(0, RETVAL, 1, struct trstuct ); |
134
|
|
|
|
|
|
|
// malloc filename, we need it for some error printouts |
135
|
2
|
|
|
|
|
|
New(0, RETVAL->filename, i+1, char ); |
136
|
2
|
|
|
|
|
|
strncpy(RETVAL->filename,str,i); |
137
|
|
|
|
|
|
|
// put a zero at the end of the string, perl might not do it |
138
|
2
|
|
|
|
|
|
*(RETVAL->filename + i )=(char)0; |
139
|
|
|
|
|
|
|
// malloc initial buffer |
140
|
2
|
|
|
|
|
|
New(0, RETVAL->buffer, BUFFLEN+1, char ); |
141
|
2
|
|
|
|
|
|
RETVAL->currbuflen=BUFFLEN; |
142
|
2
|
|
|
|
|
|
RETVAL->fd=fh; |
143
|
2
|
|
|
|
|
|
RETVAL->charpos=0; |
144
|
2
|
|
|
|
|
|
RETVAL->tagcharpos=0; |
145
|
2
|
|
|
|
|
|
RETVAL->fileline=1; |
146
|
2
|
|
|
|
|
|
RETVAL->tagline=0; |
147
|
|
|
|
|
|
|
OUTPUT: |
148
|
|
|
|
|
|
|
RETVAL |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
void |
151
|
|
|
|
|
|
|
DESTROY(self) |
152
|
|
|
|
|
|
|
HTML::TagReader self |
153
|
|
|
|
|
|
|
CODE: |
154
|
5
|
|
|
|
|
|
Safefree(self->filename); |
155
|
5
|
|
|
|
|
|
Safefree(self->buffer); |
156
|
5
|
|
|
|
|
|
PerlIO_close(self->fd); |
157
|
5
|
|
|
|
|
|
Safefree(self); |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
void |
160
|
|
|
|
|
|
|
tr_gettag(self,showerrors) |
161
|
|
|
|
|
|
|
HTML::TagReader self |
162
|
|
|
|
|
|
|
SV *showerrors |
163
|
|
|
|
|
|
|
PREINIT: |
164
|
|
|
|
|
|
|
int bufpos; |
165
|
|
|
|
|
|
|
char ch; |
166
|
|
|
|
|
|
|
char chn; |
167
|
|
|
|
|
|
|
int state; |
168
|
|
|
|
|
|
|
PPCODE: |
169
|
9
|
50
|
|
|
|
|
if (! self->fileline){ |
170
|
0
|
|
|
|
|
|
croak("Object not initialized"); |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
/* initialize */ |
173
|
9
|
|
|
|
|
|
state=0; |
174
|
9
|
|
|
|
|
|
bufpos=0; |
175
|
9
|
|
|
|
|
|
ch=(char)0; |
176
|
9
|
|
|
|
|
|
chn=(char)0; |
177
|
9
|
|
|
|
|
|
self->tagline=self->fileline; |
178
|
|
|
|
|
|
|
/* find the next tag */ |
179
|
119
|
100
|
|
|
|
|
while(state != 3 && (chn=PerlIO_getc(self->fd))!=EOF ){ |
|
|
100
|
|
|
|
|
|
180
|
110
|
|
|
|
|
|
self->charpos++; |
181
|
110
|
100
|
|
|
|
|
if (ch==0){ /* read one more character ahead so we have always 2 */ |
182
|
8
|
|
|
|
|
|
ch=chn; |
183
|
8
|
|
|
|
|
|
continue; |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
/* we can not run out of mem because TAGREADER_MAX_TAGLEN |
186
|
|
|
|
|
|
|
* is much smaller than BUFFLEN */ |
187
|
102
|
50
|
|
|
|
|
if (bufpos > TAGREADER_MAX_TAGLEN){ |
188
|
0
|
0
|
|
|
|
|
if (SvTRUE(showerrors)){ |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
189
|
0
|
|
|
|
|
|
PerlIO_printf(PerlIO_stderr(),"%s:%d:%d: Warning, tag not terminated or too long.\n",self->filename,self->tagline,self->charpos); |
190
|
|
|
|
|
|
|
} |
191
|
0
|
|
|
|
|
|
self->buffer[bufpos]=ch;bufpos++; |
192
|
0
|
|
|
|
|
|
self->buffer[bufpos]=(char)0;bufpos++; |
193
|
0
|
|
|
|
|
|
state=3; |
194
|
0
|
|
|
|
|
|
continue; /* jump out of while */ |
195
|
|
|
|
|
|
|
} |
196
|
102
|
100
|
|
|
|
|
if (ch=='\n') { |
197
|
5
|
|
|
|
|
|
self->fileline++; |
198
|
5
|
|
|
|
|
|
self->charpos=0; |
199
|
|
|
|
|
|
|
} |
200
|
102
|
100
|
|
|
|
|
if (ch=='\n'|| ch=='\r' || ch=='\t' || ch==' ') { |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
201
|
15
|
|
|
|
|
|
ch=' '; |
202
|
15
|
100
|
|
|
|
|
if (chn=='\n'|| chn=='\r' || chn=='\t' || chn==' '){ |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
203
|
|
|
|
|
|
|
/* delete mupltiple spaces */ |
204
|
3
|
|
|
|
|
|
ch=chn; /* shift next char */ |
205
|
3
|
|
|
|
|
|
continue; |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
} |
208
|
99
|
|
|
|
|
|
switch (state) { |
209
|
|
|
|
|
|
|
/*---*/ |
210
|
|
|
|
|
|
|
case 0: |
211
|
|
|
|
|
|
|
/* outside of tag and we start tag here*/ |
212
|
32
|
100
|
|
|
|
|
if (ch=='<') { |
213
|
9
|
100
|
|
|
|
|
if (is_start_of_tag(chn)) { |
214
|
8
|
|
|
|
|
|
self->buffer[0]=(char)0; |
215
|
8
|
|
|
|
|
|
bufpos=0; |
216
|
8
|
|
|
|
|
|
self->tagcharpos=self->charpos; |
217
|
|
|
|
|
|
|
/*line where tag starts*/ |
218
|
8
|
|
|
|
|
|
self->tagline=self->fileline; |
219
|
8
|
|
|
|
|
|
self->buffer[bufpos]=ch;bufpos++; |
220
|
8
|
|
|
|
|
|
state=1; |
221
|
|
|
|
|
|
|
}else{ |
222
|
1
|
50
|
|
|
|
|
if (SvTRUE(showerrors)){ |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
223
|
0
|
|
|
|
|
|
PerlIO_printf(PerlIO_stderr(),"%s:%d:%d: Warning, single \'<\' should be written as <\n",self->filename,self->fileline,self->charpos); |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
} |
227
|
32
|
|
|
|
|
|
break; |
228
|
|
|
|
|
|
|
/*---*/ |
229
|
|
|
|
|
|
|
case 1: |
230
|
52
|
|
|
|
|
|
self->buffer[bufpos]=ch;bufpos++; |
231
|
52
|
100
|
|
|
|
|
if (ch=='!' && chn=='-' && self->buffer[bufpos-2]=='<'){ |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
232
|
|
|
|
|
|
|
/* start of comment handling */ |
233
|
1
|
|
|
|
|
|
state=30; |
234
|
|
|
|
|
|
|
} |
235
|
52
|
100
|
|
|
|
|
if (ch=='>'){ |
236
|
6
|
|
|
|
|
|
state=3; /* note the exit state is hardcoded |
237
|
|
|
|
|
|
|
* as well in the while loop above */ |
238
|
6
|
|
|
|
|
|
self->buffer[bufpos]=(char)0;bufpos++; |
239
|
|
|
|
|
|
|
} |
240
|
52
|
50
|
|
|
|
|
if(ch=='<'){ |
241
|
|
|
|
|
|
|
/* the tag that we were reading was not terminated but instead we ge a new opening */ |
242
|
0
|
0
|
|
|
|
|
if (SvTRUE(showerrors)){ |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
243
|
0
|
|
|
|
|
|
PerlIO_printf(PerlIO_stderr(),"%s:%d:%d: Warning, \'>\' inside a tag should be written as >\n",self->filename,self->tagline,self->charpos); |
244
|
|
|
|
|
|
|
} |
245
|
0
|
|
|
|
|
|
state=1; |
246
|
0
|
|
|
|
|
|
bufpos=0; |
247
|
0
|
|
|
|
|
|
self->buffer[bufpos]=ch;bufpos++; |
248
|
0
|
|
|
|
|
|
self->tagline=self->fileline; |
249
|
|
|
|
|
|
|
} |
250
|
52
|
|
|
|
|
|
break; |
251
|
|
|
|
|
|
|
/*---*/ |
252
|
|
|
|
|
|
|
case 30: /*comment handling, |
253
|
|
|
|
|
|
|
*we have found " |