File Coverage

blib/lib/Text/Distill.pm

Criterion	Covered	Total	%
statement	12	14	85.7
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	17	19	89.4

line	stmt	sub	time	code
1				package Text::Distill;
2
3	1	1	51128	use 5.006001;
	1		6
4	1	1	7	use strict;
	1		3
	1		31
5	1	1	6	use warnings;
	1		3
	1		44
6	1	1	370	use Digest::JHash;
	1		579
	1		58
7	1	1	135	use XML::LibXML;
	0
	0
8				use XML::LibXSLT;
9				use Text::Extract::Word;
10				use HTML::TreeBuilder;
11				use OLE::Storage_Lite;
12				use Text::Unidecode v1.27;
13				use Unicode::Normalize v1.25;
14				use Encode::Detect;
15				use Encode;
16				use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
17				use Carp;
18				use LWP::UserAgent;
19				use JSON::XS;
20				use File::Temp;
21
22				Archive::Zip::setErrorHandler(sub{});
23
24				our (@ISA, @EXPORT_OK);
25				BEGIN {
26				require Exporter;
27				@ISA = qw(Exporter);
28				@EXPORT_OK = qw(
29				Distill
30				LikeSoundex
31				TextToGems
32				DetectBookFormat
33				ExtractSingleZipFile
34				CheckIfTXT
35				CheckIfFB2
36				CheckIfFB3
37				CheckIfDocx
38				CheckIfEPub
39				CheckIfDoc
40				CheckIfTXTZip
41				CheckIfFB2Zip
42				CheckIfDocxZip
43				CheckIfEPubZip
44				CheckIfDocZip
45				ExtractTextFromEPUBFile
46				ExtractTextFromDOCXFile
47				ExtractTextFromDocFile
48				ExtractTextFromTXTFile
49				ExtractTextFromFB2File
50				ExtractTextFromFB3File
51				GetFB2GemsFromFile
52				GemsValidate
53				); # symbols to export on request
54				}
55
56				my $XSL_FB2_2_String = q{
57
58				xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0">
59
60
61
62
63
64
65				66	fb:title\|
67				fb:subtitle\|
68				fb:p\|
69				fb:epigraph\|
70				fb:cite\|
71				fb:text-author\|
72				fb:date\|
73				fb:poem\|
74				fb:stanza\|
75				fb:v\|
76				fb:image[parent::fb:body]\|
77				fb:code">
78
79
80
81				};
82
83				my $XSL_FB3_2_String = q{
84
85				xmlns:fb="http://www.fictionbook.org/FictionBook3/body"
86				xmlns:fbd="http://www.fictionbook.org/FictionBook3/description">
87
88
89
90
91
92				93	fb:p\|
94				fb:li\|
95				fb:page-break-type">
96
97
98
99
100
101
102				-
103				};
104
105				my $XSL_Docx_2_Txt = q{
106
107				xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125				};
126
127				our $MinPartSize = 150;
128
129				# Гласные и прочие буквы \w, которые нас, тем не менее, не волнуют
130				my $SoundexExpendable = qr/уеёыаоэяиюьъaehiouwy/i;
131
132				# Статистически подобранные "буквосочетания", бьющие тексты на на куски по ~20к
133				# отбиралось по языкам: ru en it de fr es pl be cs sp lv
134				# в теории этот набор должен более-менее ровно нарезать любой текст на куски по ~2к
135				our @SplitChars = qw(3856 6542 4562 6383 4136 2856 4585 5512
136				2483 5426 2654 3286 5856 4245 4135 4515 4534 8312 5822 5316 1255 8316 5842);
137
138
139				my @DetectionOrder = qw /epub.zip epub docx.zip docx doc.zip doc fb2.zip fb2 fb3 txt.zip txt/;
140
141				my $Detectors = {
142				'fb2.zip' => \&CheckIfFB2Zip,
143				'fb2' => \&CheckIfFB2,
144				'fb3' => \&CheckIfFB3,
145				'doc.zip' => \&CheckIfDocZip,
146				'doc' => \&CheckIfDoc,
147				'docx.zip' => \&CheckIfDocxZip,
148				'docx' => \&CheckIfDocx,
149				'epub.zip' => \&CheckIfEPubZip,
150				'epub' => \&CheckIfEPub,
151				'txt.zip' => \&CheckIfTXTZip,
152				'txt' => \&CheckIfTXT
153				};
154
155				our $Extractors = {
156				'fb2' => \&ExtractTextFromFB2File,
157				'fb3' => \&ExtractTextFromFB3File,
158				'txt' => \&ExtractTextFromTXTFile,
159				'doc' => \&ExtractTextFromDocFile,
160				'docx' => \&ExtractTextFromDOCXFile,
161				'epub' => \&ExtractTextFromEPUBFile,
162				};
163
164				our $rxFormats = join '\|', keys %$Detectors;
165				$rxFormats =~ s/\./\\./g;
166
167				use constant FB3_META_REL => 'http://www.fictionbook.org/FictionBook3/relationships/Book';
168				use constant FB3_BODY_REL => 'http://www.fictionbook.org/FictionBook3/relationships/body';
169
170
171				=head1 NAME
172
173				Text::Distill - Quick texts compare, plagiarism and common parts detection
174
175				=head1 VERSION
176
177				Version 0.091
178
179				=cut
180
181				our $VERSION = '0.091';
182
183
184				=head1 SYNOPSIS
185
186				use Text::Distill qw(Distill);
187
188				my $DistilledText1 = Distill($text1);
189				my $DistilledText2 = Distill($text2);
190
191				$DistilledText1 eq $DistilledText2 ? print("Equal") : print("Not equal");
192
193				or
194
195				use Text::Distill;
196
197				my $FileFormat = Text::Distill::DetectBookFormat($FilePath);
198				die "Not a fb2.zip file" if $FileFormat ne 'fb2.zip';
199
200				my $Text = Text::Distill::ExtractTextFromFB2File($FilePath);
201				my $Gems = TextToGems($Text);
202
203				my $VURL = 'http://partnersdnld.litres.ru/copyright_check_by_gems/';
204				my $TextInfo = Text::Distill::GemsValidate($Gems,$VURL);
205
206				die "Copyright-protected content" if $TextInfo->{verdict} eq 'protected';
207
208				=head1 Distilling gems from text
209
210				=head2 TextToGems($UTF8TextString)
211
212				Transforms a text (valid UTF8 expected) into an array of 32-bit hash-summs
213				(Jenkins's Hash). Text is at first flattened the hard
214				way (something like soundex, see Distill below), than splitted into fragments by statistically
215				choosen sequences. First and the last fragments are rejected, short fragments are
216				rejected as well, from remaining strings calc hashes and
217				returns reference to them in the array.
218
219				What you really need to know is that TextToGem's from exactly the same texts are
220				eqlal, texts with small changes have similar "gems" as well. And
221				if two texts have 3+ common gems - they share some text parts, for sure. This is somewhat
222				close to "Edit distance", but fast on calc and indexable. So you can effectively
223				search for citings or plagiarism. Choosen split-method makes average detection
224				segment about 2k of text (1-2 paper pages), so this package will not normally detect
225				a single equal paragraph. If you need more precise match extended
226				@Text::Distill::SplitChars with some
227				sequences from SeqNumStats.xlsx on GitHub, I guiess you can get down to parts of
228				about 300 chars without problems. Just don't forget to lower
229				$Text::Distill::MinPartSize as well and keep in mind GemsValidate will break
230				if you play with $MinPartSize and @SplitChars.
231
232				Should return about one 32-bit jHash from every 2kb of source text
233				(may vary depending on the text thou).
234
235				my $Gems = TextToGems($String);
236				print join(',',@$Gems);
237
238
239				=pod
240
241				=head2 Distill($UTF8TextString)
242
243				Transforming the text (valid UTF8 expected) into a sequence of 1-8 numbers
244				(string as well). Internally used by TextToGems, but you may use it's output
245				with standart "edit distance" algorithm, like L. Distilled string
246				is shorter, so you math will go much faster.
247
248				At the end works somewhat close to 'soundex' with addition of some basic rules
249				for cyrillic chars, pre- and post-cleanup and utf normalization. Drops strange
250				sequences, drops short words as well (how are you going to make you plagiarism
251				without copying the long words, huh?)
252
253				$Distilled = Distill($Text); # $Distilled should be ~60% shorter than $Text
254
255				=head1 Remote validation
256
257				There is at least one open service to check your text against
258				known text database, docs are here: L.
259
260				=head2 GemsValidate(\@Gems, $Url)
261
262				Checks your gems against remote database, returns overall verdict
263				and a structure with info on found titles
264
265				=cut
266
267				sub GemsValidate {
268				my $Gems = shift;
269				my $Url = shift;
270
271				my $ua = new LWP::UserAgent;
272				$ua->timeout(5);
273				my $Response = $ua->post( $Url, {gems => join ",",@$Gems});
274
275				my $Result;
276				if ($Response->is_success) {
277				return decode_json( $Response->decoded_content );
278				} else {
279				die $Response->status_line;
280				}
281
282				}
283
284				# EXTRACT BLOCK
285
286				=head1 Service functions
287
288				=head2 ExtractTextFromFB2File($FilePath)
289
290				Function receives a path to the fb2-file and returns all significant text from the file as a string
291
292				=cut
293
294				sub ExtractTextFromFB2File {
295				my $FN = shift;
296
297				my $parser = XML::LibXML->new();
298				my $xslt = XML::LibXSLT->new();
299				my $source = $parser->parse_file($FN);
300				my $style_doc = $parser->load_xml(string => $XSL_FB2_2_String);
301				my $stylesheet = $xslt->parse_stylesheet($style_doc);
302				my $results = $stylesheet->transform($source);
303				my $Out = $stylesheet->output_string($results);
304
305				return $Out;
306				}
307
308				=head2 ExtractTextFromFB3File($FilePath)
309
310				Function receives a path to the fb3-file and returns all significant text from the file as a string
311
312				=cut
313
314				sub ExtractTextFromFB3File {
315				my $FN = shift;
316
317				unless( -e $FN ) {
318				Carp::confess( "$FN doesn't exist" );
319				}
320
321				# Prepare XML parser, XSLT stylesheet and XPath Context beforehand
322				my $XML = XML::LibXML->new;
323				my $StyleDoc = $XML->load_xml( string => $XSL_FB3_2_String );
324				my $Stylesheet = XML::LibXSLT->new->parse_stylesheet( $StyleDoc );
325				my $XC = XML::LibXML::XPathContext->new;
326				$XC->registerNs( opcr => 'http://schemas.openxmlformats.org/package/2006/relationships' );
327
328				# FB3 is ZIP archive following Open Packaging Conventions. Let's find FB3 Body in it
329				my $Zip = Archive::Zip->new();
330				my $ReadStatus = $Zip->read( $FN );
331				unless( $ReadStatus == AZ_OK ) {
332				Carp::confess "[Archive::Zip error] $!";
333				}
334				# First we must find package Rels file
335				my $PackageRelsXML = $Zip->contents( '_rels/.rels' )
336				or do{ $! = 11; Carp::confess 'Broken OPC package, no package Rels file (/_rels/.rels)' };
337
338				# Next find FB3 meta relation(s)
339				my $PackageRelsDoc = eval{ XML::LibXML->load_xml( string => $PackageRelsXML ) }
340				or do{ $! = 11; Carp::confess "Invalid XML: $@" };
341
342				my @RelationNodes = $XC->findnodes(
343				'/opcr:Relationships/opcr:Relationship[@Type="'.FB3_META_REL.'"]',
344				$PackageRelsDoc
345				);
346				unless( @RelationNodes ) {
347				$! = 11;
348				Carp::confess 'No relation to FB3 meta';
349				}
350
351				# There could be more than one book packed in FB3, so continue by parsing all the books found
352				my $Result = '';
353				for my $RelationNode ( @RelationNodes ) {
354				# Get FB3 meta name from relation
355				my $MetaName = OPCPartAbsoluteNameFromRelative( $RelationNode->getAttribute('Target'), '/' );
356				# Name in zip has no leading slash and name in OPC has it. Remove leading slash from OPC name
357				$MetaName =~ s:^/::;
358
359				# Get FB3 meta Rels file name
360				my $MetaRelsName = $MetaName;
361				$MetaRelsName =~ s:^(./)?([^/])$:${1}_rels/${2}.rels:;
362
363				my $MetaRelsXML = $Zip->contents( $MetaRelsName )
364				or do{ $! = 11; Carp::confess "No FB3 meta Rels file (expecting $MetaRelsName)" };
365
366				# Next we get relation to FB3 body from FB3 meta Rels file
367				my $MetaRelsDoc = eval{ $XML->load_xml( string => $MetaRelsXML ) }
368				or do{ $! = 11; Carp::confess "Invalid XML: $@" };
369
370				my( $BodyRelation ) = $XC->findnodes(
371				'/opcr:Relationships/opcr:Relationship[@Type="'.FB3_BODY_REL.'"]',
372				$MetaRelsDoc
373				);
374				unless( $BodyRelation ) {
375				$! = 11;
376				Carp::confess "No relation to FB3 body in $MetaRelsName";
377				}
378
379				# Get FB3 body name from relation
380				my $CurrentDir = $MetaName;
381				$CurrentDir =~ s:/?[^/]*$::;
382				my $BodyName = OPCPartAbsoluteNameFromRelative(
383				$BodyRelation->getAttribute('Target'),
384				"/$CurrentDir" # add leading slash (zip name to opc)
385				);
386				$BodyName =~ s:^/::; # remove leading slash (opc name to zip)
387
388				# Get FB3 body text
389				my $BodyXML = $Zip->contents( $BodyName )
390				or do{ $! = 11; Carp::confess "No FB3 body (expecting $BodyName)" };
391
392				# Transform it into plain text
393				my $BodyDoc = $XML->load_xml( string => $BodyXML, huge => 1 );
394				my $TransformResults = $Stylesheet->transform( $BodyDoc );
395				$Result .= $Stylesheet->output_string( $TransformResults );
396				}
397
398				return $Result;
399				}
400
401				=head2 ExtractTextFromTXTFile($FilePath)
402
403				Function receives a path to the text-file and returns all significant text from the file as a string
404
405				=cut
406
407				sub ExtractTextFromTXTFile {
408				my $FN = shift;
409				open(TEXTFILE, "<$FN");
410				my $String = join('', );
411				close TEXTFILE;
412
413				require Encode::Detect;
414				return Encode::decode('Detect', $String);
415				}
416
417
418				=head2 ExtractTextFromDocFile($FilePath)
419
420				Function receives a path to the doc-file and returns all significant text from the file as a string
421
422				=cut
423
424				sub ExtractTextFromDocFile {
425				my $FilePath = shift;
426
427				my $File = Text::Extract::Word->new($FilePath);
428				my $Text = $File->get_text();
429
430				return $Text;
431				}
432
433				=head2 ExtractTextFromDOCXFile($FilePath)
434
435				Function receives a path to the docx-file and returns all significant text from the file as a string
436
437				=cut
438
439				sub ExtractTextFromDOCXFile {
440				my $FN = shift;
441
442				my $Result;
443				my $arch = Archive::Zip->new();
444				if ( $arch->read($FN) == AZ_OK ) {
445				if (my $DocumentMember = $arch->memberNamed( 'word/document.xml' )) {
446				my $XMLDocument = $DocumentMember->contents();
447
448				my $xml = XML::LibXML->new();
449				my $xslt = XML::LibXSLT->new();
450
451				my $Document;
452				eval { $Document = $xml->parse_string($XMLDocument); };
453				if ($@) {
454				$! = 11;
455				Carp::confess("[libxml2 error ". $@->code() ."] ". $@->message());
456				}
457
458				my $StyleDoc = $xml->load_xml(string => $XSL_Docx_2_Txt);
459
460				my $StyleSheet = $xslt->parse_stylesheet($StyleDoc);
461
462				my $TransformResult = $StyleSheet->transform($Document);
463
464				$Result = $StyleSheet->output_string($TransformResult);
465				}
466				} else {
467				Carp::confess("[Archive::Zip error] $!");
468				}
469
470				return $Result;
471				}
472
473				=head2 ExtractTextFromEPUBFile($FilePath)
474
475				Function receives a path to the epub-file and returns all significant text from the file as a string
476
477				=cut
478
479				sub ExtractTextFromEPUBFile {
480				my $FN = shift;
481
482				my $Result;
483				my $arch = Archive::Zip->new();
484				if ( $arch->read($FN) == AZ_OK ) {
485				my $requiredMember = 'META-INF/container.xml';
486				if (my $ContainerMember = $arch->memberNamed( $requiredMember )) {
487				my $XMLContainer = $ContainerMember->contents();
488
489				my $xml = XML::LibXML->new;
490				my $xpc = XML::LibXML::XPathContext->new();
491				$xpc->registerNs('opf', 'urn:oasis:names:tc:opendocument:xmlns:container');
492
493				my $Container;
494				eval { $Container = $xml->parse_string($XMLContainer); };
495				if ($@) {
496				$! = 11;
497				Carp::confess("[libxml2 error ". $@->code() ."] ". $@->message());
498				}
499
500				my ($ContainerNode) = $xpc->findnodes('//opf:container/opf:rootfiles/opf:rootfile', $Container);
501				my $ContentPath = $ContainerNode->getAttributeNode('full-path')->string_value;
502				if (my $ContentMember = $arch->memberNamed( $ContentPath )) {
503				my $XMLContent = $ContentMember->contents();
504
505				$xpc->unregisterNs('opf');
506				$xpc->registerNs('opf', 'http://www.idpf.org/2007/opf');
507
508				my $Content;
509				eval { $Content = $xml->parse_string($XMLContent); };
510				if ($@) {
511				$! = 11;
512				Carp::confess("[libxml2 error ". $@->code() ."] ". $@->message());
513				}
514				my @ContentNodes = $xpc->findnodes('//opf:package/opf:manifest/opf:item[
515				@media-type="application/xhtml+xml"
516				and
517				starts-with(@id, "content")
518				]',
519				$Content
520				);
521				my $HTMLTree = HTML::TreeBuilder->new();
522				foreach my $ContentNode (@ContentNodes) {
523				my $HTMLContentPath = $ContentNode->getAttributeNode('href')->string_value;
524
525				if (my $HTMLContentMember = $arch->memberNamed( $HTMLContentPath )) {
526				my $HTMLContent = $HTMLContentMember->contents();
527
528				$HTMLTree->parse_content($HTMLContent);
529				} else {
530				Carp::confess("[Archive::Zip error] $HTMLContentPath not found in ePub ZIP container");
531				}
532				}
533				$Result = DecodeUtf8($HTMLTree->as_text);
534				} else {
535				Carp::confess("[Archive::Zip error] $ContentPath not found in ePub ZIP container");
536				}
537				} else {
538				Carp::confess("[Archive::Zip error] $requiredMember not found in ePub ZIP container");
539				}
540				} else {
541				Carp::confess("[Archive::Zip error] $!");
542				}
543
544				return $Result;
545				}
546
547				sub OPCPartAbsoluteNameFromRelative {
548				my $Name = shift;
549				my $Dir = shift;
550				$Dir =~ s:/$::; # remove trailing slash
551
552				my $FullName = ( $Name =~ m:^/: ) ? $Name : # $Name has absolute path
553				"$Dir/$Name"; # $Name has relative path
554				$FullName = do{
555				use bytes; # A-Za-z are case insensitive
556				lc $FullName;
557				};
558
559				# parse all . and .. in name
560				my @CleanedSegments;
561				my @OriginalSegments = split m:/:, $FullName;
562				for my $Part ( @OriginalSegments ) {
563				if( $Part eq '.' ) {
564				# just skip
565				} elsif( $Part eq '..' ) {
566				pop @CleanedSegments;
567				} else {
568				push @CleanedSegments, $Part;
569				}
570				}
571
572				return join '/', @CleanedSegments;
573				}
574
575
576				sub ExtractSingleZipFile {
577				my $FN = shift;
578				my $Ext = shift;
579				my $Zip = Archive::Zip->new();
580
581				return unless ( $Zip->read( $FN ) == Archive::Zip::AZ_OK );
582
583				my @Files = $Zip->members();
584				return unless (scalar @Files == 1 && $Files[0]->{fileName} =~ /(\.$Ext)$/);
585
586				my $TmpDir = File::Temp::tempdir(cleanup=>1);
587
588				my $OutFile = $TmpDir.'/check_' . $$ . '_' . $Files[0]->{fileName};
589
590				return $Zip->extractMember( $Files[0], $OutFile ) == Archive::Zip::AZ_OK ? $OutFile : undef;
591				}
592
593				=head2 DetectBookFormat($FilePath, $Format)
594
595				Function detects format of an e-book and returns it. You
596				may suggest the format to start with, this wiil speed up the process a bit
597				(not required).
598
599				$Format can be 'fb2.zip', 'fb2', 'doc.zip', 'doc', 'docx.zip',
600				'docx', 'epub.zip', 'epub', 'txt.zip', 'txt', 'fb3', 'fb3'
601
602				=cut
603
604				sub DetectBookFormat {
605				my $File = shift;
606				my $Format = shift;
607				if (defined $Format && $Format =~/^($rxFormats)$/) {
608				$Format = $1;
609				} else {
610				$Format = '';
611				}
612
613				#$Format первым пойдет
614				my @Formats = ($Format \|\| (), grep{ $_ ne $Format } @DetectionOrder);
615
616				foreach( @Formats ) {
617				return $_ if $Detectors->{$_}->($File);
618				}
619				return;
620				}
621
622
623				our $SplitRegexp = join ('\|',@SplitChars);
624
625				$SplitRegexp = qr/$SplitRegexp/o;
626
627				# Кластеризация согласных - глухие к глухим, звонкие к звонким
628				#my %SoundexClusters = (
629				# '1' => 'бпфвbfpv',
630				# '2' => 'сцзкгхcgjkqsxz',
631				# '3' => 'тдdt',
632				# '4' => 'лйl',
633				# '5' => 'мнmn',
634				# '6' => 'рr',
635				# '7' => 'жшщч'
636				#);
637				#my $SoundexTranslatorFrom;
638				#my $SoundexTranslatorTo;
639				#for (keys %SoundexClusters){
640				# $SoundexTranslatorFrom .= $SoundexClusters{$_};
641				# $SoundexTranslatorTo .= $_ x length($SoundexClusters{$_});
642				#}
643
644				sub TextToGems{
645				my $SrcText = Distill(shift) \|\| return;
646
647				my @DistilledParts = split /$SplitRegexp/, $SrcText;
648
649				# Началу и концу верить всё равно нельзя
650				shift @DistilledParts;
651				pop @DistilledParts;
652				my @Hashes;
653				my %SeingHashes;
654				for (@DistilledParts){
655				# Если отрывок текста короткий - мы его проигнорируем
656				next if length($_)< $MinPartSize;
657
658				# Используется Хеш-функция Дженкинса, хорошо распределенный хэш на 32 бита
659				my $Hash = Digest::JHash::jhash($_);
660
661				# Если один хэш дважды - нам второго не нужно
662				push @Hashes, $Hash unless $SeingHashes{$Hash}++;
663				}
664				return \@Hashes;
665				}
666
667				# Безжалостная мужланская функция, но в нашем случае чем топорней - тем лучше
668				sub LikeSoundex {
669				my $S = shift;
670
671				# Гласные долой, в них вечно очепятки
672				$S =~ s/[$SoundexExpendable]+//gi;
673
674				# Заменяем согласные на их кластер
675				# eval "\$String =~ tr/$SoundexTranslatorFrom/$SoundexTranslatorTo/";
676				$S =~ tr/рrлйlбпфвbfpvтдdtжшщчсцзкгхcgjkqsxzмнmn/664441111111133337777222222222222225555/;
677
678				return $S;
679				}
680
681
682				sub Distill {
683				my $String = shift;
684
685				#Нормализация юникода
686				$String = Unicode::Normalize::NFKC($String);
687
688				#Переводим в lowercase
689				$String = lc($String);
690
691				#Конструкции вида слово.слово разбиваем пробелом
692				$String =~ s/(\w[.,;:&?!*#%+\^\\\/])(\w)/$1 $2/g;
693
694				# Понятные нам знаки причешем до упрощенного вида
695				$String =~ tr/ЁёÉÓÁéóáĀāĂăĄąĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıĲĳĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňŉŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚŜśŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƒǺǻǼǽǾǿђѓєѕіїјљњћќўџҐґẀẁẂẃẄẅỲỳ/ЕеЕОАеоаAaAaAaCcCcCcCcDdDdEeEeEeEeEeGgGgGgGgHhHhIiIiIiIiIiiiJjKkкLlLlLlLlLlNnNnNnnNnOoOoOoCCRrRrRrSSssSsŠšTtTtTtUuUuUuUuUuUuWwYyYZzZzZzffAaAaOohгеsiijлнhкyuГгWWWWWWYy/;
696
697				# в словах вида папа-ёж глотаем тире (и любой другой мусор)
698				$String =~ s/(\w)([^\w\s]\|_)+(\w)/$1$3/;
699
700				# Короткие слова долой
701				# Короткие русские слова долой (у нас в русском и 6 знаков короткое)
702				$String =~ s/(\s\|^)(\S{1,5}\|[а-я]{6})\b/$1/g;
703
704				# странные конструкции вида -=[мусорсрач]=- долой, ими легко засорить
705				# текст - глаз заигнорит, а робот будет думать что текст о другом. Не будем
706				# облегчать атакующим жизнь
707				$String =~ s/(^\|\s)[^\w\s]+\s?\w+\s*[^\w\s]+($\|\s)/$1$2/g;
708
709				$String =~ s/([^\w\s]\|_)+//g;
710
711				return '' if $String !~ /\w/;
712
713				$String = LikeSoundex($String);
714
715				# Все буквы, которых мы не знаем - перегоняем в транслит, говорят оно даж китайщину жрёт
716				if ($String =~ /[^\d\s]/){
717				$String = lc Text::Unidecode::unidecode($String);
718
719				# Уборка - II, уже для транслитерированной строки
720				$String = LikeSoundex($String);
721				}
722
723				# Убираем повторы
724				$String =~ s/(\w)\1+/$1/gi;
725
726				# слишком длинные слова подрежем (оставив меточку 8, что поработали ножницами)
727				$String =~ s/(\s\|^)(\S{4})\S+\b/${2}8/g;
728
729				# Всё, мы закончили, теперь пробелы убираем, да и до кучи что там еще было
730				$String =~ s/\D//g;
731
732				return $String;
733				}
734
735				# CHECK BLOCK
736
737				=head1 Internals:
738
739				Receives a path to the file and checks whether this file is ...
740
741				B - MS Word .doc in zip-archive
742
743				B - Electronic Publication .epub in zip-archive
744
745				B - MS Word 2007 .docx in zip-archive
746
747				B - FictionBook2 (FB2) in zip-archive
748
749				B - text-file in zip-archive
750
751				B - Electronic Publication .epub
752
753				B - MS Word 2007 .docx
754
755				B - MS Word .doc
756
757				B - FictionBook2 (FB2)
758
759				B - FictionBook3 (FB3)
760
761				B - text-file
762
763				=cut
764
765				sub CheckIfDocZip {
766				my $FN = shift;
767				my $IntFile = ExtractSingleZipFile( $FN, 'doc' ) \|\| return;
768				my $Result = CheckIfDoc( $IntFile );
769				return $Result;
770				}
771
772				sub CheckIfEPubZip {
773				my $FN = shift;
774				my $IntFile = ExtractSingleZipFile( $FN, 'epub' ) \|\| return;
775				my $Result = CheckIfEPub( $IntFile );
776				return $Result;
777				}
778
779				sub CheckIfDocxZip {
780				my $FN = shift;
781				my $IntFile = ExtractSingleZipFile( $FN, 'docx' ) \|\| return;
782				my $Result = CheckIfDocx( $IntFile );
783				return $Result;
784				}
785
786				sub CheckIfFB2Zip {
787				my $FN = shift;
788				my $IntFile = ExtractSingleZipFile( $FN, 'fb2' ) \|\| return;
789				my $Result = CheckIfFB2( $IntFile );
790				return $Result;
791				}
792
793				sub CheckIfTXTZip {
794				my $FN = shift;
795				my $IntFile = ExtractSingleZipFile( $FN, 'txt' ) \|\| return;
796				my $Result = CheckIfTXT( $IntFile );
797				return $Result;
798				}
799
800				sub CheckIfEPub {
801				my $FN = shift;
802
803				my $arch = Archive::Zip->new();
804
805				if ( $arch->read($FN) == AZ_OK ) {
806				if (my $ContainerMember = $arch->memberNamed( 'META-INF/container.xml' )) {
807				my $XMLContainer = $ContainerMember->contents();
808
809				my $xml = XML::LibXML->new;
810				my $xpc = XML::LibXML::XPathContext->new();
811				$xpc->registerNs('opf', 'urn:oasis:names:tc:opendocument:xmlns:container');
812
813				my $Container;
814				eval { $Container = $xml->parse_string($XMLContainer); };
815				return if ($@ \|\| !$Container);
816
817				my ($ContainerNode) = $xpc->findnodes('//opf:container/opf:rootfiles/opf:rootfile', $Container);
818				my $ContentPath = $ContainerNode->getAttributeNode('full-path')->string_value;
819
820				if (my $ContentMember = $arch->memberNamed( $ContentPath )) {
821				my $XMLContent = $ContentMember->contents();
822
823				$xpc->unregisterNs('opf');
824				$xpc->registerNs('opf', 'http://www.idpf.org/2007/opf');
825
826				my $Content;
827				eval { $Content = $xml->parse_string($XMLContent); };
828				return if ($@ \|\| !$Content);
829
830				my @ContentNodes = $xpc->findnodes('//opf:package/opf:manifest/opf:item[
831				@media-type="application/xhtml+xml"
832				and
833				starts-with(@id, "content")
834				and
835				"content" = translate(@id, "0123456789", "")
836				]',
837				$Content
838				);
839
840				my $existedContentMembers = 0;
841				foreach my $ContentNode (@ContentNodes) {
842				my $HTMLContentPath = $ContentNode->getAttributeNode('href')->string_value;
843				$existedContentMembers++ if $arch->memberNamed( $HTMLContentPath );
844				}
845
846				return 1 if (@ContentNodes == $existedContentMembers);
847				}
848				}
849				}
850				return;
851				}
852
853				sub CheckIfDocx {
854				my $FN = shift;
855
856				my $arch = Archive::Zip->new();
857
858				return unless ( $arch->read($FN) == AZ_OK );
859				return 1 if $arch->memberNamed( 'word/document.xml' );
860				}
861
862				sub CheckIfDoc {
863				my $FilePath = shift;
864
865				my $ofs = OLE::Storage_Lite->new($FilePath);
866				my $name = Encode::encode("UCS-2LE", "WordDocument");
867				return $ofs->getPpsSearch([$name], 1, 1);
868				}
869
870				sub CheckIfFB2 {
871				my $FN = shift;
872				my $parser = XML::LibXML->new;
873				my $XML = eval{ $parser->parse_file($FN) };
874				return if( $@ \|\| !$XML );
875				return 1;
876				}
877
878				sub CheckIfFB3 {
879				my $FN = shift;
880
881				my $Zip = Archive::Zip->new();
882				my $XC = XML::LibXML::XPathContext->new;
883				$XC->registerNs( opcr => 'http://schemas.openxmlformats.org/package/2006/relationships' );
884
885				my( $RelsXML, $RelsDoc );
886				if( $Zip->read($FN) == AZ_OK
887				and $RelsXML = $Zip->contents( '_rels/.rels' )
888				and $RelsDoc = eval{ XML::LibXML->load_xml( string => $RelsXML ) }
889				and $XC->exists( '/opcr:Relationships/opcr:Relationship[@Type="'.FB3_META_REL.'"]', $RelsDoc )) {
890
891				return 1;
892
893				} else {
894				return 0;
895				}
896				}
897
898				sub CheckIfTXT {
899				my $FN = shift;
900				my $String = ExtractTextFromTXTFile($FN);
901				return $String !~ /[\x00-\x08\x0B\x0C\x0E-\x1F]/g; #всякие непечатные Control characters говорят, что у нас тут бинарник
902				}
903
904				sub DecodeUtf8 {
905				my $Out = shift;
906				if ($Out && !Encode::is_utf8($Out)) {
907				$Out = Encode::decode_utf8($Out);
908				}
909				return $Out;
910				}
911
912				=head1 REQUIRED MODULES
913
914				Digest::JHash;
915				XML::LibXML;
916				XML::LibXSLT;
917				Encode::Detect;
918				Text::Extract::Word;
919				HTML::TreeBuilder;
920				OLE::Storage_Lite;
921				Text::Unidecode (v1.27 or later);
922				Unicode::Normalize (v1.25 or later);
923				Archive::Zip
924				Encode;
925				Carp;
926				LWP::UserAgent;
927				JSON::XS;
928				File::Temp;
929
930				=head1 SCRIPTS
931
932				=head2 plagiarism_check.pl - checks your ebook againts known texts database
933
934				Script uses check_by_gems API (L). You can
935				select any "check service" provider with CHECKURL (see below),
936				by default text checked with LitRes copyright-check service:
937				L
938
939				B
940
941				> plagiarism_check.pl FILEPATH [CHECKURL] [--full-info] [--help]
942
943				B
944
945				> plagiarism_check.pl /home/file.epub --full-info
946
947				B
948
949				B> path to file for check
950
951				B> url of validating API to check file with. By default:
952				http://partnersdnld.litres.ru/copyright_check_by_gems/
953
954				B> show full info of checked
955
956				B> show this information
957
958				B
959
960				Ebook statuses explained:
961
962				B> there are either copyrights on this book or it is
963				forbidden for distribution by some other reason (racist content, etc)
964
965				B> ebook content owner distributes it for free (but
966				content may still be protected from certan kind use)
967
968				B> this it public domain, no restrictions at all
969
970				B> service have has no valid info on this text
971
972
973				=head1 AUTHOR
974
975				Litres.ru, C<< >>
976				Get the latest code from L
977
978				=head1 BUGS
979
980				Please report any bugs or feature requests to L.
981
982				=head1 SUPPORT
983
984				You can find documentation for this module with the perldoc command.
985
986				perldoc Text::Distill
987
988
989				You can also look for information at:
990
991				=over 4
992
993				=item * RT: CPAN's request tracker (report bugs here)
994
995				L
996
997				=item * AnnoCPAN: Annotated CPAN documentation
998
999				L
1000
1001				=item * CPAN Ratings
1002
1003				L
1004
1005				=item * Search CPAN
1006
1007				L
1008
1009				=back
1010
1011				=head1 LICENSE AND COPYRIGHT
1012
1013				Copyright (C) 2016 Litres.ru
1014
1015				The GNU Lesser General Public License version 3.0
1016
1017				Text::Distill is free software: you can redistribute it and/or modify it
1018				under the terms of the GNU Lesser General Public License as published by
1019				the Free Software Foundation, either version 3.0 of the License.
1020
1021				Text::Distill is distributed in the hope that it will be useful, but
1022				WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1023				or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
1024				License for more details.
1025
1026				Full text of License L.
1027
1028				=cut
1029
1030				1; # End of Text::Distill