File Coverage

blib/lib/Text/Positional/Ngram.pm

Criterion	Covered	Total	%
statement	225	342	65.7
branch	59	142	41.5
condition	3	12	25.0
subroutine	20	33	60.6
pod	14	29	48.2
total	321	558	57.5

line	stmt	bran	cond	sub	pod	time	code
1							#########################################################################
2							# PACKAGE: Text::Positional::Ngram
3							#
4							# Copyright (C), 2004-2007
5							# Bridget Thomson McInnes, bthomson@d.umn.edu
6							#
7							# University of Minnesota, Duluth
8							#
9							# USAGE:
10							# use Text::Positional::Ngram
11							#
12							# DESCRIPTION:
13							#
14							# The Text::Positional::Ngram module determines contiguous and
15							# noncontiguous n-grams and their frequency from a given corpus.
16							# See perldoc Text::Positional::Ngram
17							#
18							#########################################################################
19							package Text::Positional::Ngram;
20
21	1			1		29668	use 5.008;
	1					5
	1					42
22	1			1		6	use strict;
	1					2
	1					35
23	1			1		1082	use bytes;
	1					15
	1					5
24
25							require Exporter;
26	1			1		971	use AutoLoader qw(AUTOLOAD);
	1					1585
	1					5
27
28							our @ISA = qw(Exporter);
29
30							# Items to export into callers namespace by default. Note: do not export
31							# names by default without a very good reason. Use EXPORT_OK instead.
32							# Do not simply export all your public functions/methods/constants.
33
34							# This allows declaration use Text::Positional::Ngram ':all';
35							# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
36							# will save memory.
37							our %EXPORT_TAGS = ( 'all' => [ qw() ] );
38
39							our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
40
41							our @EXPORT = qw();
42
43							our $VERSION = '0.5';
44
45							#########################
46							# File Name Variables #
47							#########################
48							my $CORPUS_FILE = "";
49							my $VOCAB_FILE = "";
50							my $SNT_FILE = "";
51							my $SNTNGRAM_FILE = "";
52							my $NGRAM_FILE = "";
53							my $STOPLIST = "";
54							my $TOKEN_FILE = "";
55							my $NONTOKEN_FILE = "";
56
57							###########################
58							# User defined Variables #
59							###########################
60							my $max_ngram_size = 2; #default is 2
61							my $min_ngram_size = 2; #default is 2
62							my $ngram_size = 2; #default is 2
63							my $frequency = 0; #default is 0
64							my $window_size = 0; #default is 0
65
66							#########################
67							# Stop List Variables #
68							#########################
69							my $stop_mode = "AND"; #AND/OR default is AND
70							my $stop_regex = ""; #regex to store stop list
71
72							############################
73							# Token Obtion Variables #
74							############################
75							my $tokenizerRegex = "";
76							my $nontokenizerRegex = "";
77
78							####################
79							# Flag Variables #
80							####################
81							my $stop_flag = 0; #default is false
82							my $marginals = 0; #default is false
83							my $remove = 0; #default is false;
84							my $new_line = 0; #default is false
85
86							#####################
87							# Cache Variables #
88							#####################
89							my $unigrams = "";
90							my %remove_hash = ();
91
92							#####################
93							# Array Variables #
94							#####################
95							my @vocab_array = ();
96							my @window_array = ();
97
98							#########################
99							# Main Mask Variables #
100							#########################
101							# VEC VARIABLES
102							my $corpus = ""; # corpus vec
103
104							# MISC VARIABLES
105							my $N = 0; # the length of the corpus
106							my $bit = 32; # the bit size for the vec array
107							my $ngram_count = 0; # the number of ngrams
108							my $win_bit = 1; # the bit size for the windowing
109							my $timestamp = ""; # the time stamp for the files
110
111							###############
112							# new method #
113							###############
114							my $location;
115							sub new
116							{
117							# First argument is class
118	1			1	0	69	my $class = shift;
119
120	1					3	my $self = {};
121
122	1					3	bless $self, $class;
123
124	1	50				12	$self->{dir} = shift if (defined(@_ > 0));
125	1	50				4	$self->{verbose} = @_ ? shift : 0;
126
127	1	50				5	warn "Dir = ", $self->{dir}, "\n" if ($self->{verbose});
128	1	50				3	warn "Verbose = ", $self->{verbose}, "\n" if ($self->{verbose});
129
130							# Initialize some variables at new
131	1					3	$CORPUS_FILE = ""; $VOCAB_FILE = "";
	1					2
132	1					2	$SNT_FILE = ""; $SNTNGRAM_FILE = "";
	1					3
133	1					3	$NGRAM_FILE = ""; $STOPLIST = "";
	1					2
134	1					2	$TOKEN_FILE = ""; $NONTOKEN_FILE = "";
	1					2
135
136	1					2	$stop_flag = 0; $marginals = 0;
	1					2
137	1					21	$remove = 0; $new_line = 0;
	1					3
138	1					1	$ngram_size = 2; $frequency = 0;
	1					2
139	1					1	$window_size = 0;
140
141	1					3	$unigrams = ""; %remove_hash = ();
	1					2
142	1					2	$nontokenizerRegex = ""; $tokenizerRegex = "";
	1					2
143
144	1					3	return $self;
145							}
146
147
148							#######################################
149							# Create the vocabulary and snt file #
150							#######################################
151							sub create_files
152							{
153	1			1	1	6	my $self = shift; my @files = @_;
	1					3
154
155							# Open the corpus, vocab and snt files
156	1	50				114	open(VOCAB , ">$VOCAB_FILE") \|\| die "Could not open the vocabfile: $!\n";
157	1	50				62	open(SNT, ">$SNT_FILE") \|\| die "Could not open the sntfile : $!\n";
158
159							# Create the token and nontoken regular expression
160	1	50				5	if($NONTOKEN_FILE ne "") { set_nontoken_regex(); } set_token_regex();
	0					0
	1					4
161
162							################################################
163							# Index always starts at 2 because 1 is #
164							# considered a new line parameter if defined #
165							################################################
166
167	1					2	my $index = 2; my %vocab_hash = ();
	1					3
168
169	1					2	foreach (@files) {
170	1	50				34	open(CORPUS, $_) \|\| die "Could not find the corpus file: $_\n";
171	1					22	while() {
172	3					4	chomp;
173
174	3					71	s/$nontokenizerRegex//g;
175
176	3					21	while( /$tokenizerRegex/g ) {
177	22					40	my $token = $&;
178
179	22	100				80	if (! exists $vocab_hash{$token} ) {
180	18					45	print SNT "$index ";
181	18					36	print VOCAB "$index\n"; print VOCAB "$token\n";
	18					25
182	18					99	$vocab_hash{$token} = $index++;
183							}
184							else {
185	4					19	print SNT "$vocab_hash{$token} ";
186							}
187							}
188	3	50				8	print SNT "1" if $new_line;
189	3					24	print SNT "\n";
190							}
191							}
192							}
193
194							######################
195							# Remove the files #
196							######################
197							sub remove_files
198							{
199	1			1	1	9	my $self = shift;
200
201	1					7577	system("rm -rf $VOCAB_FILE");
202	1					4624	system("rm -rf $SNT_FILE");
203	1					4545	system("rm -rf $SNTNGRAM_FILE");
204							}
205
206							###########################
207							# Remove the ngram file #
208							###########################
209							sub remove_ngram_file
210							{
211	0			0	0	0	system("rm -rf $NGRAM_FILE");
212							}
213
214
215							############################
216							# Creates the token file #
217							# CODE obtained from NSP #
218							############################
219							sub set_token_regex
220							{
221	1			1	0	3	my $self = shift; my @tokenRegex = (); $tokenizerRegex = "";
	1					2
	1					17
222
223	1	50				18	if(-e $TOKEN_FILE) {
224	1	50				31	open (TOKEN, $TOKEN_FILE) \|\| die "Couldnt open $TOKEN_FILE\n";
225
226	1					30	while() {
227	1					3	chomp; s/^\s//; s/\s$//;
	1					16
	1					6
228	1	50				5	if (length($_) <= 0) { next; }
	0					0
229	1	50	33			11	if (!(/^\//) \|\| !(/\/$/))
230							{
231	0					0	print STDERR "Ignoring regex with no delimiters: $_\n"; next;
	0					0
232							}
233	1					4	s/^\///; s/\/$//;
	1					4
234	1					15	push @tokenRegex, $_;
235							}
236	1					11	close TOKEN;
237							}
238							else {
239	0					0	push @tokenRegex, "\\w+"; push @tokenRegex, "[\.,;:\?!]";
	0					0
240							}
241
242							# create the complete token regex
243
244	1					3	foreach my $token (@tokenRegex)
245							{
246	1	50				5	if ( length($tokenizerRegex) > 0 )
247							{
248	0					0	$tokenizerRegex .= "\|";
249							}
250	1					2	$tokenizerRegex .= "(";
251	1					2	$tokenizerRegex .= $token;
252	1					2	$tokenizerRegex .= ")";
253							}
254
255							# if you dont have any tokens to work with, abort
256	1	50				6	if ( $#tokenRegex < 0 )
257							{
258	0					0	print STDERR "No token definitions to work with.\n";
259	0					0	exit;
260							}
261							}
262
263							##########################################
264							# Set the non token regular expression #
265							# CODE Obtained from NSP #
266							##########################################
267							sub set_nontoken_regex
268							{
269	0			0	0	0	$nontokenizerRegex = "";
270
271							#check if the file exists
272	0	0				0	if($NONTOKEN_FILE)
273							{
274							#open the non token file
275	0	0				0	open(NOTOK, $NONTOKEN_FILE) \|\| die "Couldn't open Nontoken file $NONTOKEN_FILE.\n";
276
277	0					0	while() {
278	0					0	chomp;
279	0					0	s/^\s+//; s/\s+$//;
	0					0
280
281							#handling a blank lines
282	0	0				0	if(/^\s*$/) { next; }
	0					0
283
284	0	0				0	if(!(/^\//)) {
285	0					0	print STDERR "Nontoken regular expression $_ should start with '/'\n"; exit;
	0					0
286							}
287
288	0	0				0	if(!(/\/$/)) {
289	0					0	print STDERR "Nontoken regular expression $_ should end with '/'\n"; exit;
	0					0
290							}
291
292							#removing the / s from the beginning and the end
293	0					0	s/^\///;
294	0					0	s/\/$//;
295
296							#foorm a single regex
297	0					0	$nontokenizerRegex .="(".$_.")\|";
298							}
299
300							# if no valid regexs are found in Nontoken file
301	0	0				0	if(length($nontokenizerRegex)<=0) {
302	0					0	print STDERR "No valid Perl Regular Experssion found in Nontoken file $NONTOKEN_FILE.\n";
303	0					0	exit;
304							}
305
306	0					0	chop $nontokenizerRegex;
307							}
308							else {
309	0					0	print STDERR "Nontoken file $NONTOKEN_FILE doesn't exist.\n";
310	0					0	exit;
311							}
312							}
313
314							##############################
315							# Create the stoplist hash #
316							# CODE obtained from NSP #
317							##############################
318							sub create_stop_list
319							{
320	0			0	1	0	my $self = shift;
321	0					0	my $file = shift;
322
323	0					0	$stop_regex = "";
324
325	0	0				0	open(FILE, $file) \|\| die "Could not open the Stoplist : $!\n";
326
327	0					0	while() {
328	0					0	chomp; # accepting Perl Regexs from Stopfile
329	0					0	s/^\s+//;
330	0					0	s/\s+$//;
331
332							#handling a blank lines
333	0	0				0	if(/^\s*$/) { next; }
	0					0
334
335							#check if a valid Perl Regex
336	0	0				0	if(!(/^\//)) {
337	0					0	print STDERR "Stop token regular expression <$_> should start with '/'\n";
338	0					0	exit;
339							}
340	0	0				0	if(!(/\/$/)) {
341	0					0	print STDERR "Stop token regular expression <$_> should end with '/'\n";
342	0					0	exit;
343							}
344
345							#remove the / s from beginning and end
346	0					0	s/^\///;
347	0					0	s/\/$//;
348
349							#form a single big regex
350	0					0	$stop_regex.="(".$_.")\|";
351							}
352
353	0	0				0	if(length($stop_regex)<=0) {
354	0					0	print STDERR "No valid Perl Regular Experssion found in Stop file.";
355	0					0	exit;
356							}
357
358	0					0	chop $stop_regex;
359
360							# Reset the stop flag to true
361	0					0	$stop_flag = 1;
362
363	0					0	close FILE;
364							}
365
366							###############################
367							# Load the vocabulary array #
368							###############################
369							sub load_vocab_array
370							{
371	1	50		1	0	73	open(VOCAB, $VOCAB_FILE) \|\| die "Could not open the vocab file: $!\n";
372
373	1					3	@vocab_array = ();
374	1					31	while() {
375	18					23	chomp;
376	18					29	my $token = ; chomp $token;
	18					22
377	18					74	$vocab_array[$_] = $token;
378							}
379
380							}
381
382							#################################
383							# Set the windowing parameter #
384							#################################
385							sub set_window_size
386							{
387	0			0	1	0	my $self = shift;
388	0					0	$window_size = shift;
389							}
390
391							##############################
392							# Set the remove parameter #
393							##############################
394							sub set_remove
395							{
396	0			0	1	0	my $self = shift;
397	0					0	$remove = shift;
398							}
399
400							################################
401							# Set the marginal parameter #
402							################################
403							sub set_marginals
404							{
405	1			1	1	8	$marginals = 1;
406							}
407
408							################################
409							# Set the new_line parameter #
410							################################
411							sub set_new_line
412							{
413	0			0	0	0	$new_line = 1;
414							}
415
416							#######################
417							# Set the frequency #
418							#######################
419							sub set_frequency
420							{
421	0			0	1	0	my $self = shift;
422	0					0	$frequency = shift;
423							}
424
425							############################
426							# Set minimum ngram size #
427							############################
428							sub set_ngram_size
429							{
430	0			0	1	0	my $self = shift;
431	0					0	$ngram_size = shift;
432
433	0					0	$min_ngram_size = $ngram_size;
434	0					0	$max_ngram_size = $ngram_size;
435							}
436
437							#######################
438							# Set the stop mode #
439							#######################
440							sub set_stop_mode
441							{
442
443	0			0	1	0	my $self = shift;
444	0					0	$stop_mode = shift;
445							}
446
447							########################
448							# Set the token file #
449							########################
450							sub set_token_file
451							{
452	1			1	1	6	my $self = shift;
453	1					2	$TOKEN_FILE = shift;
454							}
455
456							###########################
457							# Set the nontoken file #
458							###########################
459							sub set_nontoken_file
460							{
461	0			0	1	0	my $self = shift;
462	0					0	$NONTOKEN_FILE = shift;
463							}
464
465							#############################
466							# Set the ngram file name #
467							#############################
468							sub set_destination_file
469							{
470	1			1	1	705	my $self = shift;
471	1					3	my $file = shift;
472
473	1					6	$timestamp = time();
474
475							# Set the file names of the internal files
476							# that will be used by the perl module.
477	1					5	$VOCAB_FILE = $file . ".vocab." . $timestamp;
478	1					3	$SNT_FILE = $file . ".snt." . $timestamp;
479	1					2	$SNTNGRAM_FILE = $file . ".sntngram." . $timestamp;
480
481							# Set the ngram file
482	1					4	$NGRAM_FILE = $file;
483							}
484
485							#################################
486							# Return the number of ngrams #
487							#################################
488							sub get_ngram_count
489							{
490	0			0	1	0	return $ngram_count;
491							}
492
493							###########################
494							# Return the ngram file #
495							###########################
496							sub get_ngram_file
497							{
498	0			0	0	0	return $NGRAM_FILE;
499							}
500
501							##########################################################
502							# METHOD THAT CALLS THE FUNCTIONS TO OBTAIN THE NGRAMS #
503							##########################################################
504							sub get_ngrams
505							{
506
507							# Set the ngram count to zero
508	1			1	1	6	$ngram_count = 0;
509
510							# Create the corpus array
511	1					6	corpus_array();
512
513							# If the window size is 0; set it equal to the
514							# size of the ngram
515	1	50				4	$window_size = $ngram_size if $window_size == 0;
516
517							# check to make certain marginals are not set with
518							# the window size greater than the ngram size
519	1	50				4	$marginals = 0 if $window_size > $ngram_size;
520
521							# create the window
522	1					3	create_window();
523
524							# print ngrams to the snt ngram file
525	1					5	print_sntngrams();
526
527							# print the token ngram to the ngram file
528	1					4	print_ngrams();
529							}
530
531							#############################
532							# Create the corpus array #
533							#############################
534							sub corpus_array
535							{
536							#open SNTFILE
537	1	50		1	0	92	open(SNT, $SNT_FILE) \|\| die "Could not open the sntfile: $!\n";
538
539							# Initialize the variables
540	1					3	my $offset = 0; $corpus = ""; $N = 0;
	1					2
	1					3
541
542	1					13	while(){
543	3					5	chomp;
544	3					20	my @t = split/\s+/;
545	3					8	foreach (@t) { vec($corpus, $offset++, $bit) = $_; $N++; vec($unigrams, $_, $bit)++; }
	22					64
	22					32
	22					74
546							}
547
548							#decrement N by one to obtain the actual size of the corpus
549	1					2	$N--;
550							}
551
552							#################################################
553							# WINDOWING METHODS DESCRIBED BY GIL AND DIAS #
554							#################################################
555							sub create_window
556							{
557	1			1	0	3	my $doc = 0; @window_array = ();
	1					2
558
559	1					3	for my $i(0..$N) {
560
561	22					53	for my $j(0..((2**$window_size)-1) ) {
562
563							# determine the binary representation of one of the possible ngram combinations
564	88					922	my @bitarray = split//, unpack("B32", pack("N", $j));
565
566							# ensure that it is not greater than the corpus size
567	88	100				333	if($i+$ngram_size > ($N+1) ) { next; }
	4					21
568
569							# Reduce the bit array from 32 to the window size
570	84	100				255	my @bits = @bitarray[$#bitarray-$window_size+1 .. $#bitarray]; if($bits[0] != 1) { next; }
	84					231
	42					189
571
572							# Get the size of the possible ngram and if it to large or small - next
573	42	100				58	my $size=0; map{$size+=$_} @bits; if($size!=$ngram_size){ next; }
	42					63
	84					162
	42					91
	21					95
574
575							# Looks like everything is correct so create the bit vec for the positional ngram
576	21					30	my $temp = ""; my $offset = 0;
	21					24
577
578							# Set document number and the start position
579	21					64	vec($temp, $offset++, $bit) = $doc; vec($temp, $offset++, $bit) = $i;
	21					52
580
581							# Create the bit map
582	21					45	map{ vec($temp, $offset++, $win_bit) = $bits[$_] } 0..$#bits; push @window_array, $temp;
	42					161
	21					156
583							}
584							}
585							}
586
587							##############################################
588							# Prints the positional ngrams for testing #
589							##############################################
590							sub print_window
591							{
592	0			0	0	0	print "$#window_array\n";
593	0					0	foreach my $win (@window_array) {
594	0					0	my $start = vec($win, 1, $bit);
595	0					0	my $index = $start;
596	0					0	my @ngram = ();
597	0					0	for my $j(2..$window_size+1) {
598	0	0				0	if( vec($win, $j, $win_bit) == 1 ) {
599	0					0	push @ngram, vec($corpus, $index, $bit);
600							}
601	0					0	$index++;
602							}
603	0					0	print "@ngram\n";
604							}
605							}
606
607							#####################################
608							# Print the sntngrams to the file #
609							#####################################
610							sub print_sntngrams
611							{
612							# Open the SNTGRAM File #
613	1	50		1	0	96	open(SNTNGRAM, ">$SNTNGRAM_FILE") \|\|
614							die "Could not open the SNTNGRAM file : $!\n";
615
616							# Load the vocab hash if doesn;t exist
617	1	50				5	if (!@vocab_array) { load_vocab_array(); }
	1					4
618
619	1					3	my @prev = (); my $freq = 0;
	1					3
620	1					9	foreach my $win (sort byvec @window_array) {
621
622	21					31	my $index = vec($win, 1, $bit); my @ngram = (); my $zero_flag = 0;
	21					24
	21					23
623
624							# Get the ngram
625	21					43	for (2..$window_size+1) {
626	42	50				94	push @ngram, vec($corpus, $index, $bit) if vec($win, $_, $win_bit) == 1; $index++;
	42					65
627							}
628
629	21	50				53	if($ngram[$#ngram] == 0) { next; }
	0					0
630
631							# First time around need to initialize the @prev
632	21	100				81	if($#prev == -1 ) { @prev = @ngram; $freq++;}
	1	50				2
	1					3
633
634							# If @ngram and @prev are not equal print @prev and its freq.
635							# Reinitialize freq and @prev, increment the ngram count.
636							elsif( (join " ", @ngram) ne (join " ", @prev) ) {
637
638							# Check if the ngram is valid
639	20					45	my $return_value = valid( (join " ", @prev), $freq);
640
641							# If it is okay, print the ngram to the sntngram file
642	20	50				44	if($return_value == 1) { print SNTNGRAM "@prev $freq\n"; }
	20					69
643
644							# Initialize the freq and previous ngram
645	20					21	$freq = 1; @prev = @ngram;
	20					64
646
647	0					0	} else { $freq++; }
648							}
649							# Print the last ngram to the sntngram file if it is valid
650	1					4	my $return_value = valid( (join " ", @prev), $freq );
651	1	50				4	if($return_value == 1) { print SNTNGRAM "@prev $freq\n"; }
	1					5
652							}
653
654							#####################################################
655							# Check to determine if the ngram is a valid ngram #
656							######################################################
657							sub valid
658							{
659	21			21	0	58	my @ngram = split/\s+/, shift;
660	21					24	my $freq = shift;
661
662							# Initialize variables
663	21					25	my $doStop = 0; my $line = 0; my @token_ngram = ();
	21					24
	21					30
664
665							# Get the token ngram
666	21					36	map { push @token_ngram, $vocab_array[$ngram[$_]] } 0..$#ngram;
	42					92
667
668							# If stoplist exists determine if the ngram is part of the stoplist
669	21	50				46	if($stop_flag) {
670
671							# Set the doStop flag
672	0	0				0	if($stop_mode=~/OR\|or/) { $doStop = 0; } else { $doStop = 1; }
	0					0
	0					0
673
674	0					0	for my $i(0..$#token_ngram) {
675							# if mode is OR, remove the current ngram if any word is a stop word
676	0	0				0	if($stop_mode=~/OR\|or/) { if($token_ngram[$i]=~/$stop_regex/) { $doStop=1; last; } }
	0	0				0
	0					0
	0					0
677
678							# if mode is AND, accept the current ngram if any word is not a stop word
679	0	0				0	else { if(!($token_ngram[$i]=~/$stop_regex/)) { $doStop=0; last; } }
	0					0
	0					0
680							}
681							# If counting the marginals add the adjustment to the remove_hash
682	0	0	0			0	if($doStop && $marginals) {
683	0					0	for (0..$#ngram) {
684	0	0				0	if(exists $remove_hash{$_ . ":" . $ngram[$_]}) {
685	0					0	$remove_hash{$_ . ":" . $ngram[$_]} += $freq;
686							}
687	0					0	else { $remove_hash{$_ . ":" . $ngram[$_]} = $freq; }
688							}
689							}
690							}
691
692							# If new line determine if the new line exists in the ngram
693	21	0				35	if($new_line) { map { if($_ == 1) { $line++; } } @ngram; }
	0	50				0
	0					0
	0					0
694
695							# If the ngram frequency is greater or equal to a specified frequency, a new
696							# line flag is false and the ngram is not elimanted by the stop list then print
697							# the ngram in its integer form with its frequency to the snt ngram file
698	21	50	33			85	if($doStop == 0 && $line == 0) {
699	21	50				32	if($remove <= $freq) {
700	21					32	$ngram_count+=$freq;
701	21	50				45	if($frequency <= $freq) { return 1; }
	21					54
702							}
703							else {
704	0					0	for (0..$#ngram) {
705	0	0				0	if(exists $remove_hash{$_ . ":" . $ngram[$_]}) {
706	0					0	$remove_hash{$_ . ":" . $ngram[$_]} += $freq;
707							}
708	0					0	else { $remove_hash{$_ . ":" . $ngram[$_]} = $freq; }
709							}
710							}
711							}
712	0					0	return 0;
713							}
714
715							#############################################
716							# Print the positional ngrams to the file #
717							#############################################
718							sub print_ngrams
719							{
720							#open the SNTNGRAM file
721	1	50		1	0	79	open(SNTNGRAM, $SNTNGRAM_FILE) \|\| die "Could not open the sntngram file: $!\n";
722
723							#open the ngram file
724	1	50				68	open(NGRAM, ">$NGRAM_FILE") \|\| die "Could not open the ngram file: $! \n";
725
726							# Load the vocab hash if doesn;t exist
727	1	50				5	if (!@vocab_array) { load_vocab_array(); }
	0					0
728
729							# Print the ngram count
730	1					17	print NGRAM "$ngram_count\n";
731
732	1					20	while() {
733							# get the ngram and its frequency
734	21					24	chomp; my @ngram = split/\s+/, $_; my @marginalFreqs = ();
	21					58
	21					30
735
736	21					66	my $freq = pop @ngram;
737
738	21	50				47	if($marginals) { @marginalFreqs = Marginals(@ngram); }
	21					34
739
740							# print the ngram
741	21					43	for (0..$#ngram) { print NGRAM "$vocab_array[$ngram[$_]]<>"; }
	42					112
742
743							# print the frequencies
744	21					123	print NGRAM "$freq @marginalFreqs \n";
745							}
746							}
747
748
749							# Gets the marginal counts for each individual word in the ngram
750							sub Marginals
751							{
752	21			21	0	27	my @marginalFreqs = ();
753
754	21					44	for my $i(0..$#_) {
755	42					82	push @marginalFreqs, vec($unigrams, $_[$i], $bit);
756
757	42	100				82	if($i == 0) {
758	21	50				57	if($_[$i] == vec($corpus, $N, $bit)) { $marginalFreqs[$#marginalFreqs] -= 1; }
	0					0
759							}
760	42	100				88	if($i == $#_) {
761	21	50				49	if($_[$i] == vec($corpus, 0, $bit)) { $marginalFreqs[$#marginalFreqs] -= 1; }
	0					0
762							}
763
764	42	50	33			186	if($stop_flag \|\| $remove > 0) {
765	0	0				0	if(exists $remove_hash{$i . ":" . $_[$i]}) {
766	0					0	$marginalFreqs[$#marginalFreqs] -= $remove_hash{$i . ":" . $_[$i]};
767							}
768							}
769							}
770	21					58	return @marginalFreqs;
771							}
772
773							#############################
774							# Windowing sort function #
775							#############################
776							sub byvec
777							{
778							# Get the ngrams of the two elements
779	60			60	0	81	my @a_array = (); my @b_array = (); my $z = 0; my $x = 0; my $counter = 0;
	60					63
	60					59
	60					59
	60					61
780	60					69	my $a_index = vec($a, 1, $bit); my $b_index = vec($b, 1, $bit);
	60					61
781	60					150	for my $i(2..$window_size+1) {
782	120	50				216	if(vec($a, $i, $win_bit) == 1) { push @a_array, vec($corpus, $a_index, $bit); } $a_index++;
	120					190
	120					111
783	120	50				236	if(vec($b, $i, $win_bit) == 1) { push @b_array, vec($corpus, $b_index, $bit); } $b_index++;
	120					142
	120					169
784							}
785
786							# Find the first occurence of a non equal token in the ngrams, if exists.
787	60	100				109	for $z(0..$#a_array) { if($a_array[$z]!=$b_array[$z]) { $x = $z; next; } }
	120					213
	115					103
	115					171
788
789	60	50				190	return ( $a_array[$x] > $b_array[$x] ? 1 :
		100
790							($a_array[$x] < $b_array[$x] ? -1 : 0) );
791
792							}
793
794
795							1;
796
797							__END__