File Coverage

src/nsHebrewProber.cpp

Criterion	Covered	Total	%
statement	20	37	54.0
branch	13	46	28.2
condition			n/a
subroutine			n/a
pod			n/a
total	33	83	39.7

line	stmt	bran	code
1			/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2			/* *** BEGIN LICENSE BLOCK ***
3			* Version: MPL 1.1/GPL 2.0/LGPL 2.1
4			*
5			* The contents of this file are subject to the Mozilla Public License Version
6			* 1.1 (the "License"); you may not use this file except in compliance with
7			* the License. You may obtain a copy of the License at
8			* http://www.mozilla.org/MPL/
9			*
10			* Software distributed under the License is distributed on an "AS IS" basis,
11			* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12			* for the specific language governing rights and limitations under the
13			* License.
14			*
15			* The Original Code is Mozilla Universal charset detector code.
16			*
17			* The Initial Developer of the Original Code is
18			* Shy Shalom <shooshX@gmail.com>
19			* Portions created by the Initial Developer are Copyright (C) 2005
20			* the Initial Developer. All Rights Reserved.
21			*
22			* Contributor(s):
23			*
24			* Alternatively, the contents of this file may be used under the terms of
25			* either the GNU General Public License Version 2 or later (the "GPL"), or
26			* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27			* in which case the provisions of the GPL or the LGPL are applicable instead
28			* of those above. If you wish to allow use of your version of this file only
29			* under the terms of either the GPL or the LGPL, and not to allow others to
30			* use your version of this file under the terms of the MPL, indicate your
31			* decision by deleting the provisions above and replace them with the notice
32			* and other provisions required by the GPL or the LGPL. If you do not delete
33			* the provisions above, a recipient may use your version of this file under
34			* the terms of any one of the MPL, the GPL or the LGPL.
35			*
36			* *** END LICENSE BLOCK *** */
37
38			#include "nsHebrewProber.h"
39			#include <stdio.h>
40
41			// windows-1255 / ISO-8859-8 code points of interest
42			#define FINAL_KAF ('\xea')
43			#define NORMAL_KAF ('\xeb')
44			#define FINAL_MEM ('\xed')
45			#define NORMAL_MEM ('\xee')
46			#define FINAL_NUN ('\xef')
47			#define NORMAL_NUN ('\xf0')
48			#define FINAL_PE ('\xf3')
49			#define NORMAL_PE ('\xf4')
50			#define FINAL_TSADI ('\xf5')
51			#define NORMAL_TSADI ('\xf6')
52
53			// Minimum Visual vs Logical final letter score difference.
54			// If the difference is below this, don't rely solely on the final letter score distance.
55			#define MIN_FINAL_CHAR_DISTANCE (5)
56
57			// Minimum Visual vs Logical model score difference.
58			// If the difference is below this, don't rely at all on the model score distance.
59			#define MIN_MODEL_DISTANCE (0.01)
60
61			#define VISUAL_HEBREW_NAME ("ISO-8859-8")
62			#define LOGICAL_HEBREW_NAME ("windows-1255")
63
64	8		PRBool nsHebrewProber::isFinal(char c)
65			{
66	8	50	return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI));
		50
		50
67			}
68
69	0		PRBool nsHebrewProber::isNonFinal(char c)
70			{
71	0	0	return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE));
		0
72			// The normal Tsadi is not a good Non-Final letter due to words like
73			// 'lechotet' (to chat) containing an apostrophe after the tsadi. This
74			// apostrophe is converted to a space in FilterWithoutEnglishLetters causing
75			// the Non-Final tsadi to appear at an end of a word even though this is not
76			// the case in the original text.
77			// The letters Pe and Kaf rarely display a related behavior of not being a
78			// good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
79			// example legally end with a Non-Final Pe or Kaf. However, the benefit of
80			// these letters as Non-Final letters outweighs the damage since these words
81			// are quite rare.
82			}
83
84			/** HandleData
85			* Final letter analysis for logical-visual decision.
86			* Look for evidence that the received buffer is either logical Hebrew or
87			* visual Hebrew.
88			* The following cases are checked:
89			* 1) A word longer than 1 letter, ending with a final letter. This is an
90			* indication that the text is laid out "naturally" since the final letter
91			* really appears at the end. +1 for logical score.
92			* 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
93			* Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
94			* the Non-Final form of that letter. Exceptions to this rule are mentioned
95			* above in isNonFinal(). This is an indication that the text is laid out
96			* backwards. +1 for visual score
97			* 3) A word longer than 1 letter, starting with a final letter. Final letters
98			* should not appear at the beginning of a word. This is an indication that
99			* the text is laid out backwards. +1 for visual score.
100			*
101			* The visual score and logical score are accumulated throughout the text and
102			* are finally checked against each other in GetCharSetName().
103			* No checking for final letters in the middle of words is done since that case
104			* is not an indication for either Logical or Visual text.
105			*
106			* The input buffer should not contain any white spaces that are not (' ')
107			* or any low-ascii punctuation marks.
108			*/
109	4		nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen)
110			{
111			// Both model probers say it's not them. No reason to continue.
112	4	50	if (GetState() == eNotMe)
113			return eNotMe;
114
115	4		const char *curPtr, *endPtr = aBuf+aLen;
116			char cur;
117
118	50	100	for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr)
119			{
120	46		cur = *curPtr;
121	46	50	if (cur == ' ') // We stand on a space - a word just ended
122			{
123	0	0	if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word
124			{
125	0	0	if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space]
126	0		++mFinalCharLogicalScore;
127	0	0	else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space]
128	0		++mFinalCharVisualScore;
129			}
130			}
131			else // Not standing on a space
132			{
133	46	100	if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space]
		50
		0
		50
134	0		++mFinalCharVisualScore;
135			}
136	46		mBeforePrev = mPrev;
137	46		mPrev = cur;
138			}
139
140			// Forever detecting, till the end or until both model probers return eNotMe (handled above).
141			return eDetecting;
142			}
143
144			// Make the decision: is it Logical or Visual?
145	0		const char* nsHebrewProber::GetCharSetName()
146			{
147			// If the final letter score distance is dominant enough, rely on it.
148	0		PRInt32 finalsub = mFinalCharLogicalScore - mFinalCharVisualScore;
149	0	0	if (finalsub >= MIN_FINAL_CHAR_DISTANCE)
150			return LOGICAL_HEBREW_NAME;
151	0	0	if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE))
152			return VISUAL_HEBREW_NAME;
153
154			// It's not dominant enough, try to rely on the model scores instead.
155	0		float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence();
156	0	0	if (modelsub > MIN_MODEL_DISTANCE)
157			return LOGICAL_HEBREW_NAME;
158	0	0	if (modelsub < -(MIN_MODEL_DISTANCE))
159			return VISUAL_HEBREW_NAME;
160
161			// Still no good, back to final letter distance, maybe it'll save the day.
162	0	0	if (finalsub < 0)
163			return VISUAL_HEBREW_NAME;
164
165			// (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
166	0		return LOGICAL_HEBREW_NAME;
167			}
168
169
170	8		void nsHebrewProber::Reset(void)
171			{
172	8		mFinalCharLogicalScore = 0;
173	8		mFinalCharVisualScore = 0;
174
175			// mPrev and mBeforePrev are initialized to space in order to simulate a word
176			// delimiter at the beginning of the data
177	8		mPrev = ' ';
178	8		mBeforePrev = ' ';
179	8		}
180
181	4		nsProbingState nsHebrewProber::GetState(void)
182			{
183			// Remain active as long as any of the model probers are active.
184	4	50	if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe))
		0
		50
185			return eNotMe;
186	4		return eDetecting;
187			}
188
189			#ifdef DEBUG_chardet
190			void nsHebrewProber::DumpStatus()
191			{
192			printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore);
193			}
194			#endif