File Coverage

blib/lib/Alvis/QueryFilter.pm

Criterion	Covered	Total	%
statement	24	604	3.9
branch	0	208	0.0
condition	0	114	0.0
subroutine	8	41	19.5
pod	7	13	53.8
total	39	980	3.9

line	stmt	bran	cond	sub	pod	time	code
1							package Alvis::QueryFilter;
2
3							########################################################################
4							#
5							# A quick'n'dirty query filter
6							#
7							# -- Kimmo Valtonen
8							#
9							########################################################################
10
11	1			1		32235	use strict;
	1					2
	1					34
12	1			1		5	use warnings;
	1					2
	1					26
13
14	1			1		5	use Carp;
	1					6
	1					95
15	1			1		3463	use Data::Dumper;
	1					10678
	1					67
16
17	1			1		470	use Alvis::Treetagger;
	1					3
	1					33
18	1			1		874	use CQL::Parser;
	1					37598
	1					51
19	1			1		884	use URI::Escape;
	1					2761
	1					77
20
21	1			1		7	use strict;
	1					2
	1					7972
22
23							our $VERSION = '0.3';
24							our $verbose = 0;
25
26							my ($ERR_OK,
27							$ERR_CQL_PARSER_INST,
28							$ERR_XML_PARSER,
29							$ERR_NO_QUERY,
30							$ERR_CQL_PARSE,
31							$ERR_NONIMP_NODE_TYPE,
32							$ERR_NO_SEQ_LIST,
33							$ERR_TREETAGGER,
34							$ERR_LEMMA_DICT,
35							$ERR_APPLYING_TERM_NE,
36							$ERR_APPLYING_TYPING,
37							$ERR_APPLYING_ONTO,
38							$ERR_STRUCT2XML,
39							$ERR_CREATING_CAT_LIST,
40							$ERR_CREATING_SEQ_LIST,
41							$ERR_CREATING_CQL_TAIL,
42							$ERR_NEED_BOTH_TERM_AND_NE
43							)=(0..16);
44
45							my %ErrMsgs=($ERR_OK=>"",
46							$ERR_CQL_PARSER_INST=>"Instantiating CQL::Parser failed.",
47							$ERR_XML_PARSER=>"Instantiating the XML parser failed.",
48							$ERR_NO_QUERY=>"No query.",
49							$ERR_CQL_PARSE=>"CQL parsing failed.",
50							$ERR_NONIMP_NODE_TYPE=>'Non-implemented CQL node type.',
51							$ERR_NO_SEQ_LIST=>
52							"No current data structure representing the " .
53							"query expansion. No preceding UI2Zebra() " .
54							"call or we're out of sync. You'll get your " .
55							"money back.",
56							$ERR_TREETAGGER=>"Applying the treetagger failed.",
57							$ERR_LEMMA_DICT=>"Applying the lemma dictionary failed.",
58							$ERR_APPLYING_TERM_NE=>"Applying the terms and NEs failed.",
59							$ERR_APPLYING_TYPING=>"Applying the typing rules failed.",
60							$ERR_APPLYING_ONTO=>"Applying the ontology mappings failed.",
61							$ERR_STRUCT2XML=>"Converting the data structure into XML failed.",
62							$ERR_CREATING_CAT_LIST=>
63							"Extracting category part of query failed.",
64							$ERR_CREATING_SEQ_LIST=>
65							"Creating the list of possible expansions failed.",
66							$ERR_CREATING_CQL_TAIL=>"Converting the data structure to a CQL " .
67							"tail failed.",
68							$ERR_NEED_BOTH_TERM_AND_NE=>"We need both a term and a NE dictionary."
69							);
70
71							sub new
72							{
73	0			0	1		my $proto=shift;
74
75	0		0				my $class=ref($proto)\|\|$proto;
76	0		0				my $parent=ref($proto)&&$proto;
77	0						my $self={};
78	0						bless($self,$class);
79
80	0						$self->_set_err_state($ERR_OK);
81
82	0						$self->_init(@_);
83
84	0						$self->{CQLParser}=new CQL::Parser();
85	0	0					if (!defined($self->{CQLParser}))
86							{
87	0						$self->_set_err_state($ERR_CQL_PARSER_INST);
88	0						return undef;
89							}
90
91	0						$Alvis::Treetagger::verbose = $verbose;
92	0						&Alvis::Treetagger::reopen();
93	0						$self->{termMaxLen} = 0;
94	0						$self->{textFields} = "text";
95	0						$self->{tcanon} = \&canonise_def;
96	0						$self->{ncanon} = \&canonise_def;
97							# this must match the lemma indexing rules in alvis2index.xsl
98	0						$self->{lemmaSearch} = "^[VNJ]";
99
100
101	0						return $self;
102							}
103
104							sub _init
105							{
106	0			0			my $self=shift;
107
108	0						$self->{keepLemmas}=1;
109
110	0	0					if (defined(@_))
111							{
112	0						my %args=@_;
113	0						@$self{ keys %args }=values(%args);
114							}
115
116							}
117
118							sub _set_err_state
119							{
120	0			0			my $self=shift;
121	0						my $errcode=shift;
122	0						my $errmsg=shift;
123
124	0	0					if (!defined($errcode))
125							{
126	0						confess("set_err_state() called with an undefined argument.");
127							}
128
129	0	0					if (exists($ErrMsgs{$errcode}))
130							{
131	0	0					if ($errcode==$ERR_OK)
132							{
133	0						$self->{errstr}="";
134							}
135							else
136							{
137	0						$self->{errstr}.=" " . $ErrMsgs{$errcode};
138	0	0					if (defined($errmsg))
139							{
140	0						$self->{errstr}.=" " . $errmsg;
141							}
142							}
143							}
144							else
145							{
146	0						confess("Internal error: set_err_state() called with an " .
147							"unrecognized argument ($errcode).")
148							}
149							}
150
151							sub errmsg
152							{
153	0			0	0		my $self=shift;
154
155	0						return $self->{errstr};
156							}
157
158							############################################################################
159							#
160							# Public methods
161							#
162							############################################################################
163
164							sub set_lemma
165							{
166	0			0	1		my $self=shift;
167	0						$self->{lemmaSearch} = shift;
168							}
169
170							sub set_text_fields
171							{
172	0			0	1		my $self=shift;
173	0						my $fields=shift;
174	0						$fields =~ s/\s+/ /g;
175	0						$fields =~ s/"//g;
176	0						$self->{textFields} = $fields;
177							}
178							sub set_canon
179							{
180	0			0	1		my $self=shift;
181	0						$self->{tcanon} = shift();
182	0						$self->{ncanon} = shift();
183							}
184							sub read_dicts
185							{
186	0			0	1		my $self=shift;
187	0						my $lemma_dict_f=shift;
188	0						my $term_dict_f=shift;
189	0						my $NE_dict_f=shift;
190	0						my $typing_rules_f=shift;
191	0						my $onto_nodes_f=shift;
192	0						my $onto_mapping_f=shift;
193
194	0	0					if (defined($lemma_dict_f))
195							{
196	0						$self->{lemma_dict}=$self->_read_lemma_dict($lemma_dict_f);
197							}
198							else
199							{
200	0						undef $self->{lemma_dict};
201							}
202	0	0					if (defined($term_dict_f))
203							{
204	0						$self->{term_dict}=$self->_read_term_dict($term_dict_f);
205							}
206							else
207							{
208	0						undef $self->{term_dict};
209							}
210	0	0					if (defined($NE_dict_f))
211							{
212	0						$self->{NE_dict}=$self->_read_NE_dict($NE_dict_f);
213							}
214							else
215							{
216	0						undef $self->{NE_dict};
217							}
218	0	0					if (defined($typing_rules_f))
219							{
220	0						$self->{typing_rules}=$self->_read_typing_rules($typing_rules_f);
221							}
222							else
223							{
224	0						undef $self->{typing_rules};
225							}
226	0	0					if (defined($onto_nodes_f))
227							{
228	0						$self->{onto_nodes}=$self->_read_onto_nodes($onto_nodes_f);
229							}
230							else
231							{
232	0						undef $self->{onto_nodes};
233							}
234	0	0					if (defined($onto_mapping_f))
235							{
236	0						$self->{onto_paths}=$self->_read_onto_mapping($onto_mapping_f);
237							}
238							else
239							{
240	0						undef $self->{onto_paths};
241							}
242
243							# print STDERR " Term dict. check: 'Northern blot' -> " .
244							# $self->{term_dict}->{&canonise_def('Northern blot')} . "\n";
245
246	0						return 1;
247							}
248
249							sub cleanspaces() {
250	0			0	0		$_ = shift();
251	0						s/\s+/ /g;
252	0						s/^ //g;
253	0						s/ $//g;
254	0						return $_;
255							}
256
257							sub _read_lemma_dict
258							{
259	0			0			my $self=shift;
260	0						my $f=shift;
261
262	0						my %dict=();
263
264	0	0					if (!defined(open(F,"<:utf8",$f)))
265							{
266	0						return undef;
267							}
268
269	0						while (my $l=)
270							{
271	0						chomp $l;
272	0						my ($form,$lemma,$pos)=split(/\t/,$l,-1);
273	0						$form = &cleanspaces($form);
274	0						$dict{lc($form)}{lemma}=&cleanspaces($lemma);
275	0						$dict{lc($form)}{POS}=$pos;
276							}
277
278	0						close(F);
279
280	0						return \%dict;
281							}
282
283							# default method to standardise terms and named entities
284							# lower case, ignore space and '-'
285							sub canonise_def {
286	0			0	0		$_ = shift();
287	0						s/\s+//g;
288	0						s/\-//g;
289	0						$_ = lc($_);
290							}
291
292							sub _read_term_dict
293							{
294	0			0			my $self=shift;
295	0						my $f=shift;
296
297	0						my %dict=();
298	0						my $term_max_len = 0;
299
300	0	0					if (!defined(open(F,"<:utf8",$f)))
301							{
302	0						return undef;
303							}
304
305	0						while (my $l=)
306							{
307	0						chomp $l;
308	0						my ($form,$can)=split(/\t/,$l,-1);
309	0						$form = &cleanspaces($form);
310	0						$can = &cleanspaces($can);
311	0						my $cf = &{$self->{tcanon}}($form);
	0
312	0	0	0				if ( $verbose && defined($dict{$cf}) && $dict{$cf} ne $can ) {
			0
313	0						print STDERR "Term of form '$form' has canonical form '$can'\n"
314							. " but maps to the another canonical form '$dict{$cf}'\n";
315							}
316	0						$dict{$cf}=$can;
317	0						my @tt = split(/ /,$form);
318	0	0					if ( scalar(@tt)> $term_max_len) {
319	0						$term_max_len = scalar(@tt);
320							}
321							}
322	0	0					if ( $self->{termMaxLen}<$term_max_len ) {
323	0						$self->{termMaxLen} = $term_max_len;
324							}
325
326	0						close(F);
327
328	0						return \%dict;
329							}
330
331							sub _read_NE_dict
332							{
333	0			0			my $self=shift;
334	0						my $f=shift;
335
336	0						my %dict=();
337	0						my $term_max_len = 0;
338
339	0	0					if (!defined(open(F,"<:utf8",$f)))
340							{
341	0						return undef;
342							}
343
344	0						while (my $l=)
345							{
346	0						chomp $l;
347	0						my ($form,$can)=split(/\t/,$l,-1);
348	0						$form = &cleanspaces($form);
349	0						$can = &cleanspaces($can);
350	0						my $cf = &{$self->{ncanon}}($form);
	0
351	0	0	0				if ( $verbose && defined($dict{$cf}) && $dict{$cf} ne $can ) {
			0
352	0						print STDERR "NE of form '$form' has canonical form '$can'\n"
353							. " but maps to the another canonical form '$dict{$cf}'\n";
354							}
355	0						$dict{$cf}=$can;
356	0						my @tt = split(/\s+/,$form);
357	0	0					if ( scalar(@tt)> $term_max_len) {
358	0						$term_max_len = scalar(@tt);
359							}
360							}
361	0	0					if ( $self->{termMaxLen}<$term_max_len ) {
362	0						$self->{termMaxLen} = $term_max_len;
363							}
364
365	0						close(F);
366
367	0						return \%dict;
368							}
369
370							sub _read_typing_rules
371							{
372	0			0			my $self=shift;
373	0						my $f=shift;
374
375	0						my %dict=();
376
377	0	0					if (!defined(open(F,"<:utf8",$f)))
378							{
379	0						return undef;
380							}
381
382	0						while (my $l=)
383							{
384	0						chomp $l;
385	0						my ($form,$type)=split(/\t/,$l,-1);
386	0						$form = &cleanspaces($form);
387	0						$type = &cleanspaces($type);
388	0						$dict{$form}=$type;
389							}
390
391	0						close(F);
392
393	0						return \%dict;
394							}
395
396							sub _read_onto_nodes
397							{
398	0			0			my $self=shift;
399	0						my $f=shift;
400
401	0						my %dict=();
402
403	0	0					if (!defined(open(F,"<:utf8",$f)))
404							{
405	0						return undef;
406							}
407
408	0						while (my $l=)
409							{
410	0						chomp $l;
411	0						my ($form,$onto_node)=split(/\t/,$l,-1);
412	0						$form = &cleanspaces($form);
413	0						$onto_node = &cleanspaces($onto_node);
414	0						$dict{$form}=$onto_node;
415							}
416
417	0						close(F);
418
419	0						return \%dict;
420							}
421
422							sub _read_onto_mapping
423							{
424	0			0			my $self=shift;
425	0						my $f=shift;
426
427	0						my %dict=();
428
429	0	0					if (!defined(open(F,"<:utf8",$f)))
430							{
431	0						return undef;
432							}
433
434	0						while (my $l=)
435							{
436	0						chomp $l;
437	0						my ($node,$path)=split(/\t/,$l,-1);
438	0						$node = &cleanspaces($node);
439	0						$path = &cleanspaces($path);
440	0						$dict{$node}=$path;
441							}
442
443	0						close(F);
444
445	0						return \%dict;
446							}
447
448							sub transform # just for testing and debugging
449							{
450	0			0	0		my $self=shift;
451	0						my $query=shift; # list of word forms
452
453	0						my $expanded_query_struct=$self->_expand_qword_list($query);
454
455	0						$self->{queryForm} = $query;
456	0						$self->{finalForm} = "";
457
458	0						my $query_XML=$self->_data_struct2XML($expanded_query_struct);
459
460	0						return $query_XML;
461							}
462
463							#
464							# Given a list of word forms, expand
465							#
466							sub _expand_qword_list
467							{
468	0			0			my $self=shift;
469	0						my $query=shift; # list of word forms
470
471							# print STDERR "Q: " . Dumper($query) . "\n";
472
473	0						my $lemmatized_by_tagger=$self->_apply_treetagger($query);
474	0	0					if (!defined($lemmatized_by_tagger))
475							{
476	0						$self->_set_err_state($ERR_TREETAGGER);
477	0						return undef;
478							}
479
480							# print STDERR "LEM: " . Dumper($lemmatized_by_tagger) . "\n";
481
482	0						my $lemmatized=
483							$self->_apply_lemma_dict($lemmatized_by_tagger); # if one exists
484	0	0					if (!defined($lemmatized))
485							{
486	0						$self->_set_err_state($ERR_LEMMA_DICT);
487	0						return undef;
488							}
489
490							# print STDERR "LEMTAG: " . Dumper($lemmatized) . "\n";
491
492	0						my $term_NE_expanded=$self->_apply_terms_and_NEs($lemmatized);
493	0	0					if (!defined($term_NE_expanded))
494							{
495	0						$self->_set_err_state($ERR_APPLYING_TERM_NE);
496	0						return undef;
497							}
498							# print STDERR "TERM: " . Dumper($term_NE_expanded) . "\n";
499
500
501	0						my $typing_expanded=$self->_apply_typing_rules($term_NE_expanded);
502	0	0					if (!defined($typing_expanded))
503							{
504	0						$self->_set_err_state($ERR_APPLYING_TYPING);
505	0						return undef;
506							}
507
508	0						my $onto_expanded=$self->_apply_onto($typing_expanded);
509	0	0					if (!defined($onto_expanded))
510							{
511	0						$self->_set_err_state($ERR_APPLYING_ONTO);
512	0						return undef;
513							}
514							# print STDERR "FINAL: " . Dumper($onto_expanded) . "\n";
515
516	0						return $onto_expanded;
517							}
518
519							# extract query from SRU
520							sub UI2Query
521							{
522	0			0	0		my $self=shift;
523	0						my $SRU=shift;
524	0	0					if ( /&query=([^\&]*)/ ) {
525	0						return $1;
526							}
527	0						return "";
528							}
529
530							#
531							# UI ---> Zebra middle man
532							#
533							sub UI2Zebra
534							{
535	0			0	1		my $self=shift;
536	0						my $SRU=shift;
537
538	0						my @expanded_SRU=();
539
540							# extract the query
541	0						my $query;
542	0						my @p=split(/\&/,$SRU,-1);
543	0						for my $p (@p)
544							{
545	0	0					if ($p=~/^query=(.*)$/)
546							{
547	0						$query=$1;
548							}
549							else
550							{
551	0						push(@expanded_SRU,$p); # so we can reconstruct
552							}
553							}
554	0	0					if (!defined($query))
555							{
556	0						$self->_set_err_state($ERR_NO_QUERY,"SRU:\"$SRU\"");
557	0						return undef;
558							}
559	0						$self->{queryForm} = $query;
560	0						$self->{queryForm} =~ s/\&/\&/g;
561	0						$self->{queryForm} =~ s/
562	0						$self->{queryForm} =~ s/>/\>/g;
563	0						$self->{finalForm} = "";
564
565							# decode percentage notation
566	0						my $query_copy=$query;
567	0						$query_copy=uri_unescape($query_copy);
568
569							# parse the CQL
570	0						my $parse_tree;
571							eval
572	0						{
573	0						$parse_tree=$self->{CQLParser}->parse($query_copy);
574							};
575	0	0					if ($@)
576							{
577	0						chomp($query);
578	0						$@=~s/(.) at . line [0-9]+\n/$1/o;
579	0						$self->_set_err_state($ERR_CQL_PARSE,"Query:\"$query\".");
580	0						return undef;
581							}
582
583							# Get a list of all possible text query word sequences (so this is
584							# implicitly an OR of them)
585							#
586	0						my $t_qwords=[[]]; # help variable used in the recursion
587	0						my $seq_list=
588							$self->_get_text_qwords($parse_tree,$t_qwords);
589	0	0					if (!defined($seq_list))
590							{
591	0						$self->_set_err_state($ERR_CREATING_SEQ_LIST);
592	0						return undef;
593							}
594
595							# Get the categorising tail anded to the end
596							#
597	0						my $cats=&get_categories($parse_tree);
598	0	0					if (!defined($cats))
599							{
600	0						$self->_set_err_state($ERR_CREATING_CAT_LIST);
601	0						return undef;
602							}
603
604							# Important! Used in the following Zebra2UI, 'cause the
605							# SRU response has nothing about the query.
606							# So...if used out of sync/with more than one client...kaboom!
607							# Not my problem.
608	0						$self->{currSeqList}=$seq_list;
609
610							# print STDERR "Term elements: " . Dumper($seq_list) . "\n";
611
612							#
613							# Ok, create the 'tail' i.e. what we AND to the original query
614							# as an OR of possible expansions
615							#
616	0						my $CQL_tail=$self->_data_struct2CQLtail($seq_list);
617	0	0					if (!defined($CQL_tail))
618							{
619	0						$self->_set_err_state($ERR_CREATING_CQL_TAIL);
620	0						return undef;
621							}
622
623							# print STDERR "QQ##$query##$cats##$CQL_tail\n";
624
625							#$query='%28' . $query . '%29%20and%20' $CQL_tail;
626	0						$query=$CQL_tail;
627	0	0					if ( $cats ) {
628	0						$query .= '%20and%20' . $cats;
629							}
630
631	0						push(@expanded_SRU,"query=$query");
632
633	0						$self->{finalForm} = $query;
634	0						$self->{finalForm} =~ s/\&/\&/g;
635	0						$self->{finalForm} =~ s/
636	0						$self->{finalForm} =~ s/>/\>/g;
637
638	0						return join('&',@expanded_SRU);
639							}
640
641							#
642							# Zebra ---> UI middle man
643							#
644							sub Zebra2UI
645							{
646	0			0	1		my $self=shift;
647	0						my $SRU_response=shift;
648
649							# We need to know what the query was! It's not in the response.
650							# Of course this is bloody dangerous if we get out of sync or
651							# have more than 1 client.
652							#
653	0	0					if (!defined($self->{currSeqList}))
654							{
655	0						$self->_set_err_state($ERR_NO_SEQ_LIST);
656	0						return undef;
657							}
658
659							#
660							# Just convert it to our XML format to put into extraResponseData
661							# I chose to just catenate elements as an implicit OR..
662							#
663	0						my $query_XML=$self->_data_struct2XML($self->{currSeqList});
664	0	0					if (!defined($query_XML))
665							{
666	0						$self->_set_err_state($ERR_STRUCT2XML);
667	0						return undef;
668							}
669
670	0						${$SRU_response} =~ s/<\/zs:searchRetrieveResponse>/$query_XML<\/zs:extraResponseData><\/zs:searchRetrieveResponse>/;
	0
671
672	0						return 1;
673							}
674
675
676							#
677							# Recursive CQL parse tree traversal, results in picking out the relevant
678							# text query words in order. Too tired to explain.
679							#
680							sub _get_text_qwords
681							{
682	0			0			my $self=shift;
683	0						my $CQL_parse_node=shift;
684	0						my $text_qwords=shift;
685
686	0						my ($text_qwords_l,$text_qwords_r);
687
688							# print STDERR "ENTRY:",Dumper($text_qwords);
689
690	0	0					if ($CQL_parse_node->isa("CQL::AndNode"))
		0
		0
		0
691							{
692							# warn "AND";
693	0						$text_qwords_l=$self->_get_text_qwords($CQL_parse_node->left(),
694							$text_qwords);
695	0						$text_qwords_r=$self->_get_text_qwords($CQL_parse_node->right(),
696							$text_qwords_l);
697
698	0						return $text_qwords_r;
699							}
700							elsif ($CQL_parse_node->isa("CQL::OrNode"))
701							{
702							# warn "OR";
703
704	0						$text_qwords_l=$self->_get_text_qwords($CQL_parse_node->left(),
705							$text_qwords);
706	0						$text_qwords_r=$self->_get_text_qwords($CQL_parse_node->right(),
707							$text_qwords);
708
709	0						return [@$text_qwords_l,@$text_qwords_r];
710							}
711							elsif ($CQL_parse_node->isa("CQL::NotNode"))
712							{
713	0						$text_qwords_l=$self->_get_text_qwords($CQL_parse_node->left(),
714							$text_qwords);
715
716	0						return $text_qwords_l;
717
718							}
719							elsif ($CQL_parse_node->isa("CQL::TermNode"))
720							{
721							# warn "TERM";
722
723	0						my $qualifier=$CQL_parse_node->getQualifier();
724	0						my ($index_set_name,$index_name)=split(/\./,$qualifier);
725
726	0	0					if (!defined($index_name))
727							{
728	0						$index_set_name=$self->{indexSetName};
729	0						$index_name=$qualifier;
730							}
731
732	0						my $term=$CQL_parse_node->getTerm();
733
734							# Our partial hack solution: if it contains a space, leave as is.
735							# Wray's hack on hack - keep space stuff, and deal with it differently
736	0						if ( 1 \|\| ( $term!~/\s/) )
737							{
738	0						$term =~ s/\s+/\#\#/g;
739	0	0	0				if ($qualifier eq 'text' \|\| $qualifier eq 'srw.ServerChoice')
740							{
741	0						my @update=();
742	0						for my $qwords (@$text_qwords)
743							{
744	0						my @qw=@$qwords;
745	0						push(@qw,$term);
746	0						push(@update,[@qw]);
747							}
748	0						return \@update;
749							}
750							}
751							}
752
753	0						return $text_qwords;
754							}
755
756							#
757							# Recursive CQL parse tree traversal, results in picking out
758							# the ANDed category part at the end
759							#
760							sub get_categories
761							{
762	0			0	0		my $CQL_parse_node=shift;
763
764	0						my ($text_catq_l,$text_catq_r);
765
766							# print STDERR "ENTRY:",Dumper($text_catq);
767
768	0	0					if ($CQL_parse_node->isa("CQL::AndNode"))
		0
		0
		0
769							{
770							# warn "AND";
771	0						$text_catq_l=&get_categories($CQL_parse_node->left());
772	0						$text_catq_r=&get_categories($CQL_parse_node->right());
773	0	0	0				if ( $text_catq_l && $text_catq_r ) {
774	0						return $text_catq_l . ' and ' .$text_catq_r;
775							}
776	0						return $text_catq_l . $text_catq_r;
777							}
778							elsif ($CQL_parse_node->isa("CQL::OrNode"))
779							{
780	0						return "";
781							}
782							elsif ($CQL_parse_node->isa("CQL::NotNode"))
783							{
784	0						return "";
785
786							}
787							elsif ($CQL_parse_node->isa("CQL::TermNode"))
788							{
789							# warn "TERM";
790
791	0						my $qualifier=$CQL_parse_node->getQualifier();
792
793							# Our partial hack solution: if it contains a space, leave as is.
794							# Wray's hack on hack - keep space stuff, and deal with it differently
795
796	0	0	0				if ($qualifier ne 'text' && $qualifier ne 'srw.ServerChoice') {
797	0						return $CQL_parse_node->toCQL();
798							} else {
799	0						return "";
800							}
801							}
802
803	0						return "";
804							}
805
806							#
807							# Converts our expansion data structure to a CQL "tail"
808							#
809							sub _data_struct2CQLtail
810							{
811	0			0			my $self=shift;
812	0						my $seq_list=shift;
813
814	0						my $query;
815	0						my @seq_items=();
816	0						for my $seq (@$seq_list)
817							{
818	0						my $ds=$self->_expand_qword_list($seq);
819
820	0						my @items=();
821
822	0						for (my $i=0;$i
823							{
824	0						my ($token,$POS,$lemma,$max_type,$match_can_form,$pathtype)
825	0						= @{$ds->[$i]};
826
827	0	0	0				if ( $POS eq 'INDEX' && $token =~ /^([a-z0-9\-\_\.]+)=(.*)/ ) {
		0	0
		0	0
828	0						push(@items,"$1%3D%22$2%22");
829							} elsif (defined($max_type))
830							{
831	0	0					if ($max_type eq 'term_dict')
		0
832							{
833	0						my $surface_form=$token;
834	0						my $can_form=$match_can_form;
835	0						my $onto=$pathtype;
836	0						my $j;
837	0						for ($j=$i+1;$j
838							{
839	0						my ($token,$POS,$lemma,$max_type,$match_can_form,
840							$onto_path)
841	0						=@{$ds->[$j]};
842	0	0	0				if (!defined($max_type) \|\| $max_type ne 'term_dict'
			0
			0
843							\|\| !defined($match_can_form) \|\| $can_form ne $match_can_form )
844							{
845	0						last;
846							}
847	0						$surface_form.=" $token";
848	0						$onto=$onto_path;
849							}
850
851	0	0	0				if ( defined($onto) && $onto ne "" )
852							{
853	0						push(@items,"term%3D%22$onto$can_form%22"); # unclear
854							}
855							else
856							{
857	0						push(@items,"term%3D%22$can_form%22");
858							}
859
860	0						$i=$j-1;
861							}
862							elsif ($max_type eq 'NE_dict')
863							{
864	0						my $surface_form=$token;
865	0						my $can_form=$match_can_form;
866	0						my $type=$pathtype;
867	0						my $j;
868	0						for ($j=$i+1;$j
869							{
870	0						my ($token,$POS,$lemma,$max_type,$match_can_form,$NE_type)
871	0						=@{$ds->[$j]};
872	0	0	0				if (!defined($max_type) \|\| $max_type ne 'NE_dict'
			0
			0
873							\|\| !defined($match_can_form) \|\| $can_form ne $match_can_form )
874							{
875	0						last;
876							}
877	0						$surface_form.=" $token";
878	0						$type = $NE_type;
879							}
880
881	0	0	0				if ( ! defined($type) \|\| $type eq "" ) {
		0
882	0						push(@items,"entity%3D%22$can_form%22");
883							} elsif ( $type !~ /\// ) {
884	0						push(@items,"entity-$type%3D%22$can_form%22");
885							} else {
886	0						push(@items,"entity%3D%22$type$can_form%22");
887							}
888
889
890	0						$i=$j-1;
891							}
892							}
893							elsif (defined($lemma) && $POS =~ /$self->{lemmaSearch}/o
894							&& $self->{keepLemmas})
895							{
896	0						push(@items,"lemma%3D%22$lemma%22");
897							}
898							else {
899	0						push(@items,$self->_make_term($token));
900							}
901							}
902
903	0						push(@seq_items,"%28" . join('%20and%20',@items) . "%29");
904							}
905	0	0					if ( scalar(@seq_items) <= 1 ) {
906	0						$query = $seq_items[0];
907							} else {
908	0						$query = "%28" . join('%20or%20',@seq_items) . "%29";
909							}
910
911	0						return $query;
912							}
913
914
915							sub _make_term
916							{
917	0			0			my $self=shift;
918	0						my $term=shift;
919	0	0	0				if ( $term !~/^\"/ \|\| $term !~/\"$/ ) {
920	0						$term="\"$term\"";
921							}
922	0	0					if ( $self->{textFields} =~/ / ) {
923	0						my $result = "";
924	0						foreach my $f ( split(/ /,$self->{textFields}) ) {
925	0						$result .= " or $f%3D$term";
926							}
927	0						$result =~ s/^ or //;
928	0						return "($result)";
929							}
930	0						return $self->{textFields} . "%3D$term";
931							}
932
933							#
934							# Converts our expansion data structure to XML that fits extraResponseData
935							#
936							sub _data_struct2XML
937							{
938	0			0			my $self=shift;
939	0						my $seq_list=shift;
940
941	0						my $XML = "\n " . $self->{queryForm} . "\n";
942
943							# Why was this here in the first place?
944							# $XML.="\n";
945
946	0						my @seq_items=();
947	0						for my $seq (@$seq_list)
948							{
949	0						my $ds=$self->_expand_qword_list($seq);
950
951	0						$XML.="
952	0						$XML.=" form=\"" . join(' ',@$seq) . "\" >\n";
953
954	0						for (my $i=0;$i
955							{
956	0						my ($token,$POS,$lemma,$max_type,$match_can_form,$pathtype)
957	0						= @{$ds->[$i]};
958
959	0	0	0				if (defined($max_type))
		0	0
960							{
961	0	0					if ($max_type eq 'term_dict')
		0
962							{
963	0						my $surface_form=$token;
964	0						my $can_form=$match_can_form;
965	0						my $onto=$pathtype;
966	0						my $j;
967	0						for ($j=$i+1;$j
968							{
969	0						my ($token,$POS,$lemma,$max_type,$match_can_form,
970							$onto_path)
971	0						=@{$ds->[$j]};
972	0	0	0				if (!defined($max_type) \|\| $max_type ne 'term_dict'
			0
973							\|\| $match_can_form ne $can_form)
974							{
975	0						last;
976							}
977	0						$surface_form.=" $token";
978	0						$onto=$onto_path;
979							}
980	0						$XML.="\n";
981	0						$XML.=" $surface_form \n";
982	0						$XML.=" $can_form\n";
983	0	0	0				if ( defined($onto) && $onto ne "" )
984							{
985	0						$onto =~ s/\/$//;
986	0						$XML .= " $onto\n";
987							}
988	0						$XML.="\n";
989	0						$i=$j-1;
990							}
991							elsif ($max_type eq 'NE_dict')
992							{
993	0						my $surface_form=$token;
994	0						my $can_form=$match_can_form;
995	0						my $type=$pathtype;
996	0						my $j;
997	0						for ($j=$i+1;$j
998							{
999	0						my ($token,$POS,$lemma,$max_type,$match_can_form,$NE_type)
1000	0						=@{$ds->[$j]};
1001	0	0	0				if ($max_type ne 'NE_dict' \|\| $match_can_form ne $can_form)
1002							{
1003	0						last;
1004							}
1005	0						$surface_form.=" $token";
1006	0						$type=$NE_type;
1007							}
1008	0						$XML.="\n";
1009	0						$XML.=" $surface_form \n";
1010	0						$XML.=" $can_form\n";
1011	0	0	0				if ( defined($type) && $type ne "" ) {
1012	0	0					if ( $type !~ /\// ) {
1013	0						$XML.=" $type\n";
1014							} else {
1015	0						$XML.=" $type\n";
1016							}
1017							}
1018	0						$XML.="\n";
1019	0						$i=$j-1;
1020							}
1021							}
1022							elsif (defined($lemma) && $POS =~ /$self->{lemmaSearch}/o
1023							&& $self->{keepLemmas})
1024							{
1025	0						$XML.="\n";
1026	0						$XML.=" $token \n";
1027	0						$XML.=" $lemma\n";
1028	0						$POS=$self->_map_POS($POS);
1029	0						$XML.=" $POS\n";
1030	0						$XML.="\n";
1031							}
1032							else # no analysis
1033							{
1034	0						$XML.="$token\n";
1035							}
1036							}
1037	0						$XML.="\n";
1038							}
1039	0						$XML .= " " . $self->{finalForm} . "\n";
1040
1041	0						return $XML;
1042							}
1043
1044							sub _map_POS
1045							{
1046	0			0			my $self=shift;
1047	0						my $POS=shift;
1048
1049	0	0					if ($POS=~/^V.*/)
		0
1050							{
1051	0						$POS='V';
1052							}
1053							elsif ($POS=~/^N.*/)
1054							{
1055	0						$POS='N';
1056							}
1057
1058	0						return $POS;
1059							}
1060
1061							sub _apply_nulltagger
1062							{
1063	0			0			my $self=shift;
1064	0						my $query=shift;
1065	0						my @lemmatized=($query, 'SENT', undef);
1066	0						return \@lemmatized;
1067							}
1068
1069							sub _apply_treetagger
1070							{
1071	0			0			my $self=shift;
1072	0						my $query=shift;
1073
1074	0						my $stringified=join(" ",@$query);
1075
1076	0						my $tagged_txt=&Alvis::Treetagger::tag($stringified);
1077	0						my @lemmatized=();
1078	0						for my $l (split(/\n/,$tagged_txt))
1079							{
1080	0						my ($token,$POS,$lemma)=split(/\t/,$l);
1081							# restore CD lemmas to the form
1082	0	0	0				if ( $POS eq "CD" ) {
		0
1083	0						$lemma = $token;
1084							}
1085							# Treetagger will make some proper nouns lower case,
1086							# so we assume in query text user only enters
1087							# upper case on purpose
1088							elsif ( $token eq ucfirst($token)
1089							&& lc($token) eq $lemma ) {
1090	0						$lemma = $token;
1091							}
1092	0						push(@lemmatized,[$token,$POS,$lemma]);
1093							}
1094
1095	0						return \@lemmatized;
1096							}
1097
1098							sub _apply_lemma_dict
1099							{
1100	0			0			my $self=shift;
1101	0						my $tagger_output=shift;
1102
1103	0						my @lemmatized=();
1104
1105							# Could be simplified, but don't have the energy any more
1106	0	0					if (defined($self->{lemma_dict}))
1107							{
1108							# not tested btw
1109							#
1110	0						for my $t (@$tagger_output)
1111							{
1112	0						my ($token,$POS,$lemma)=@$t;
1113
1114	0	0					if ( $token =~ /^([a-z0-9\-\_\.]+)=(.*)/i ) {
		0
		0
		0
1115	0						$token =~ s/\#\#/ /g;
1116	0						push(@lemmatized,[$token,'INDEX',undef]);
1117							} elsif ( $token =~ /\#\#/ ) {
1118							# Wray's hack on hack
1119	0						$token =~ s/\#\#/ /g;
1120	0						$token = "\"$token\"";
1121	0						push(@lemmatized,[$token,'TEXT',undef]);
1122							} elsif ( $token =~ /_/ ) {
1123	0						$token =~ s/_+/ /g;
1124	0						$token =~ s/ $//;
1125	0						$token =~ s/^ //;
1126	0						push(@lemmatized,[$token,'TAG',undef]);
1127							} elsif ($lemma eq "") # try to fix
1128							{
1129	0	0					if (defined($self->{lemma_dict}{$token}{lemma}))
1130							{
1131	0						my $actual_POS;
1132	0	0					if (defined($self->{lemma_dict}{$token}{POS}))
1133							{
1134	0						$actual_POS=$self->{lemma_dict}{$token}{POS};
1135							}
1136
1137	0						push(@lemmatized,[$token,$actual_POS,
1138							$self->{lemma_dict}{$token}{lemma}]);
1139							}
1140							else # mark as lacking
1141							{
1142	0						push(@lemmatized,[$token,$POS,undef]);
1143							}
1144							}
1145							else
1146							{
1147	0						push(@lemmatized,[$token,$POS,$lemma]);
1148							}
1149							}
1150							}
1151							else # no lemma dictionary so mark as lacking
1152							{
1153	0						for my $t (@$tagger_output)
1154							{
1155	0						my ($token,$POS,$lemma)=@$t;
1156
1157	0	0					if ( $token =~ /^([a-z0-9\-\_\.]+)=(.*)/i ) {
		0
		0
		0
1158	0						$token =~ s/\#\#/ /g;
1159	0						push(@lemmatized,[$token,'INDEX',undef]);
1160							} elsif ( $token =~ /\#\#/ ) {
1161							# Wray's hack on hack
1162	0						$token =~ s/\#\#/ /g;
1163	0						$token = "\"$token\"";
1164	0						push(@lemmatized,[$token,'TEXT',undef]);
1165							} elsif ( $token =~ /_/ ) {
1166	0						$token =~ s/_+/ /g;
1167	0						$token =~ s/ $//;
1168	0						$token =~ s/^ //;
1169	0						push(@lemmatized,[$token,'TAG',undef]);
1170							} elsif ($lemma eq "") {
1171	0						push(@lemmatized,[$token,$POS,undef]);
1172							}
1173							else
1174							{
1175	0						push(@lemmatized,[$token,$POS,$lemma]);
1176							}
1177							}
1178							}
1179
1180	0						return \@lemmatized;
1181							}
1182
1183							sub _apply_terms_and_NEs
1184							{
1185	0			0			my $self=shift;
1186	0						my $lemmatized=shift;
1187
1188	0						my @exp=();
1189
1190	0	0	0				if (!defined($self->{term_dict}) \|\| !defined($self->{NE_dict}))
1191							{
1192	0						$self->_set_err_state($ERR_NEED_BOTH_TERM_AND_NE);
1193	0						return undef;
1194							}
1195							else
1196							{
1197	0						for (my $start=0; $start
1198							{
1199	0						my ($token,$POS,$lemma)=@{$lemmatized->[$start]};
	0
1200
1201							# dont check the special cases
1202	0	0	0				if ( $POS eq 'INDEX' \|\| $POS eq 'TEXT' ) {
1203	0						$start++;
1204	0						next;
1205							}
1206
1207							# find the longest match
1208	0						my $max=0;
1209	0						my $max_type;
1210							my $max_form;
1211
1212							# longest possible match
1213	0						my $maxmax = scalar(@$lemmatized)-$start;
1214	0						my $term = "";
1215
1216	0	0					if ( $maxmax > $self->{termMaxLen} ) {
1217	0						$maxmax = $self->{termMaxLen};
1218							}
1219							# tags get checked without any following sequence
1220	0	0					if ( $POS eq 'TAG' ) {
1221	0						$maxmax = 1;
1222							}
1223	0						for ( my $i=0; $i<$maxmax; $i++) {
1224	0	0					if ( $i>0 ) {
1225	0						my $tpos = $lemmatized->[$start+$i]->[1];
1226							# these types mark a hard boundary
1227	0	0	0				if ( $tpos eq 'INDEX' \|\| $tpos eq 'TAG' \|\| $tpos eq 'TEXT' ) {
			0
1228	0						last;
1229							}
1230							}
1231	0						my $lemma;
1232	0	0					if (!defined($lemmatized->[$start+$i]->[2]))
1233							{
1234	0						$lemma=$lemmatized->[$start+$i]->[0];
1235							}
1236							else
1237							{
1238	0						$lemma=$lemmatized->[$start+$i]->[2];
1239							}
1240	0						$term .= " " . $lemma;
1241	0						my $max_match;
1242	0	0					if ( defined($max_match=
	0	0
1243	0						$self->{term_dict}->{&{$self->{tcanon}}($term)}) ) {
1244	0						$max = $i+1;
1245	0						$max_type = 'term_dict';
1246	0						$max_form = $max_match;
1247							} elsif ( defined($max_match=
1248							$self->{NE_dict}->{&{$self->{ncanon}}($term)}) ) {
1249	0						$max = $i+1;
1250	0						$max_type = 'NE_dict';
1251	0						$max_form = $max_match;
1252							}
1253							}
1254
1255	0	0					if ($max)
1256							{
1257	0						for (my $i=$start; $i<$start+$max;$i++)
1258							{
1259	0						my ($token,$POS,$lemma)=@{$lemmatized->[$i]};
	0
1260	0						$lemmatized->[$i]=[$token,$POS,$lemma,$max_type,$max_form];
1261							}
1262	0						$start+=$max;
1263							}
1264							else
1265							{
1266	0						$start++;
1267							}
1268							}
1269
1270
1271							}
1272
1273	0						return $lemmatized;
1274							}
1275
1276							sub _apply_typing_rules
1277							{
1278	0			0			my $self=shift;
1279	0						my $lemmatized=shift;
1280
1281	0						my @exp=();
1282
1283	0	0					if (!defined($self->{typing_rules}))
1284							{
1285							# not an error
1286	0						return $lemmatized;
1287							}
1288							else
1289							{
1290	0						for (my $i=0; $i
1291							{
1292	0						my ($token,$POS,$lemma,$max_type,$match_can_form)=
1293	0						@{$lemmatized->[$i]};
1294							# print STDERR "_apply_typing_rules: ($token,$POS,$lemma,$max_type,$match_can_form)\n";
1295							# print STDERR " maps -> " . $self->{typing_rules}{$match_can_form} . "\n";
1296	0	0	0				if (defined($max_type) && $max_type eq 'NE_dict' &&
			0
			0
1297							defined($match_can_form) &&
1298							defined($self->{typing_rules}{$match_can_form}))
1299							{
1300	0						$lemmatized->[$i]=
1301							[$token,$POS,$lemma,$max_type,$match_can_form,
1302							$self->{typing_rules}{$match_can_form}];
1303							}
1304							}
1305
1306							}
1307
1308	0						return $lemmatized;
1309							}
1310
1311							sub _apply_onto
1312							{
1313	0			0			my $self=shift;
1314	0						my $lemmatized=shift;
1315
1316	0						my @exp=();
1317
1318	0	0	0				if (!(defined($self->{onto_nodes}) && defined($self->{onto_paths})))
1319							{
1320							# not an error, but need both or nothing
1321	0						return $lemmatized;
1322							}
1323							else
1324							{
1325	0						for (my $i=0; $i
1326							{
1327	0						my ($token,$POS,$lemma,$max_type,$match_can_form)=
1328	0						@{$lemmatized->[$i]};
1329	0	0	0				if ( defined($match_can_form) &&
1330							defined($self->{onto_nodes}{$match_can_form}))
1331							{
1332	0						my $node=$self->{onto_nodes}{$match_can_form};
1333	0	0					if ( defined($self->{onto_paths}{$node}) )
1334							{
1335	0						my $path = $self->{onto_paths}{$node};
1336	0	0					if ( $path ne "" ) {
1337	0						$lemmatized->[$i] =
1338							[$token,$POS,$lemma,$max_type,$match_can_form,"$path/"];
1339							}
1340							}
1341							}
1342							}
1343
1344
1345							}
1346
1347	0						return $lemmatized;
1348							}
1349
1350
1351							1;
1352							__END__