| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ | 
| 2 |  |  |  |  |  |  | /* ***** BEGIN LICENSE BLOCK ***** | 
| 3 |  |  |  |  |  |  | * Version: MPL 1.1/GPL 2.0/LGPL 2.1 | 
| 4 |  |  |  |  |  |  | * | 
| 5 |  |  |  |  |  |  | * The contents of this file are subject to the Mozilla Public License Version | 
| 6 |  |  |  |  |  |  | * 1.1 (the "License"); you may not use this file except in compliance with | 
| 7 |  |  |  |  |  |  | * the License. You may obtain a copy of the License at | 
| 8 |  |  |  |  |  |  | * http://www.mozilla.org/MPL/ | 
| 9 |  |  |  |  |  |  | * | 
| 10 |  |  |  |  |  |  | * Software distributed under the License is distributed on an "AS IS" basis, | 
| 11 |  |  |  |  |  |  | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | 
| 12 |  |  |  |  |  |  | * for the specific language governing rights and limitations under the | 
| 13 |  |  |  |  |  |  | * License. | 
| 14 |  |  |  |  |  |  | * | 
| 15 |  |  |  |  |  |  | * The Original Code is Mozilla Universal charset detector code. | 
| 16 |  |  |  |  |  |  | * | 
| 17 |  |  |  |  |  |  | * The Initial Developer of the Original Code is | 
| 18 |  |  |  |  |  |  | * Netscape Communications Corporation. | 
| 19 |  |  |  |  |  |  | * Portions created by the Initial Developer are Copyright (C) 2001 | 
| 20 |  |  |  |  |  |  | * the Initial Developer. All Rights Reserved. | 
| 21 |  |  |  |  |  |  | * | 
| 22 |  |  |  |  |  |  | * Contributor(s): | 
| 23 |  |  |  |  |  |  | *          Shy Shalom <shooshX@gmail.com> | 
| 24 |  |  |  |  |  |  | * | 
| 25 |  |  |  |  |  |  | * Alternatively, the contents of this file may be used under the terms of | 
| 26 |  |  |  |  |  |  | * either the GNU General Public License Version 2 or later (the "GPL"), or | 
| 27 |  |  |  |  |  |  | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), | 
| 28 |  |  |  |  |  |  | * in which case the provisions of the GPL or the LGPL are applicable instead | 
| 29 |  |  |  |  |  |  | * of those above. If you wish to allow use of your version of this file only | 
| 30 |  |  |  |  |  |  | * under the terms of either the GPL or the LGPL, and not to allow others to | 
| 31 |  |  |  |  |  |  | * use your version of this file under the terms of the MPL, indicate your | 
| 32 |  |  |  |  |  |  | * decision by deleting the provisions above and replace them with the notice | 
| 33 |  |  |  |  |  |  | * and other provisions required by the GPL or the LGPL. If you do not delete | 
| 34 |  |  |  |  |  |  | * the provisions above, a recipient may use your version of this file under | 
| 35 |  |  |  |  |  |  | * the terms of any one of the MPL, the GPL or the LGPL. | 
| 36 |  |  |  |  |  |  | * | 
| 37 |  |  |  |  |  |  | * ***** END LICENSE BLOCK ***** */ | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | #include <stdio.h> | 
| 40 |  |  |  |  |  |  | #include "prmem.h" | 
| 41 |  |  |  |  |  |  |  | 
| 42 |  |  |  |  |  |  | #include "nsSBCharSetProber.h" | 
| 43 |  |  |  |  |  |  | #include "nsSBCSGroupProber.h" | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | #include "nsHebrewProber.h" | 
| 46 |  |  |  |  |  |  |  | 
| 47 | 4 |  |  |  |  |  | nsSBCSGroupProber::nsSBCSGroupProber() | 
| 48 |  |  |  |  |  |  | { | 
| 49 | 4 | 50 |  |  |  |  | mProbers[0] = new nsSingleByteCharSetProber(&Win1251Model); | 
| 50 | 4 | 50 |  |  |  |  | mProbers[1] = new nsSingleByteCharSetProber(&Koi8rModel); | 
| 51 | 4 | 50 |  |  |  |  | mProbers[2] = new nsSingleByteCharSetProber(&Latin5Model); | 
| 52 | 4 | 50 |  |  |  |  | mProbers[3] = new nsSingleByteCharSetProber(&MacCyrillicModel); | 
| 53 | 4 | 50 |  |  |  |  | mProbers[4] = new nsSingleByteCharSetProber(&Ibm866Model); | 
| 54 | 4 | 50 |  |  |  |  | mProbers[5] = new nsSingleByteCharSetProber(&Ibm855Model); | 
| 55 | 4 | 50 |  |  |  |  | mProbers[6] = new nsSingleByteCharSetProber(&Latin7Model); | 
| 56 | 4 | 50 |  |  |  |  | mProbers[7] = new nsSingleByteCharSetProber(&Win1253Model); | 
| 57 | 4 | 50 |  |  |  |  | mProbers[8] = new nsSingleByteCharSetProber(&Latin5BulgarianModel); | 
| 58 | 4 | 50 |  |  |  |  | mProbers[9] = new nsSingleByteCharSetProber(&Win1251BulgarianModel); | 
| 59 |  |  |  |  |  |  |  | 
| 60 | 4 | 50 |  |  |  |  | nsHebrewProber *hebprober = new nsHebrewProber(); | 
| 61 |  |  |  |  |  |  | // Notice: Any change in these indexes - 10,11,12 must be reflected | 
| 62 |  |  |  |  |  |  | // in the code below as well. | 
| 63 | 4 |  |  |  |  |  | mProbers[10] = hebprober; | 
| 64 | 4 | 50 |  |  |  |  | mProbers[11] = new nsSingleByteCharSetProber(&Win1255Model, PR_FALSE, hebprober); // Logical Hebrew | 
| 65 | 4 | 50 |  |  |  |  | mProbers[12] = new nsSingleByteCharSetProber(&Win1255Model, PR_TRUE, hebprober); // Visual Hebrew | 
| 66 |  |  |  |  |  |  | // Tell the Hebrew prober about the logical and visual probers | 
| 67 | 4 | 50 |  |  |  |  | if (mProbers[10] && mProbers[11] && mProbers[12]) // all are not null | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 68 |  |  |  |  |  |  | { | 
| 69 |  |  |  |  |  |  | hebprober->SetModelProbers(mProbers[11], mProbers[12]); | 
| 70 |  |  |  |  |  |  | } | 
| 71 |  |  |  |  |  |  | else // One or more is null. avoid any Hebrew probing, null them all | 
| 72 |  |  |  |  |  |  | { | 
| 73 | 0 | 0 |  |  |  |  | for (PRUint32 i = 10; i <= 12; ++i) | 
| 74 |  |  |  |  |  |  | { | 
| 75 | 0 | 0 |  |  |  |  | delete mProbers[i]; | 
|  |  | 0 |  |  |  |  |  | 
| 76 | 0 |  |  |  |  |  | mProbers[i] = 0; | 
| 77 |  |  |  |  |  |  | } | 
| 78 |  |  |  |  |  |  | } | 
| 79 |  |  |  |  |  |  |  | 
| 80 |  |  |  |  |  |  | // disable latin2 before latin1 is available, otherwise all latin1 | 
| 81 |  |  |  |  |  |  | // will be detected as latin2 because of their similarity. | 
| 82 |  |  |  |  |  |  | //mProbers[10] = new nsSingleByteCharSetProber(&Latin2HungarianModel); | 
| 83 |  |  |  |  |  |  | //mProbers[11] = new nsSingleByteCharSetProber(&Win1250HungarianModel); | 
| 84 |  |  |  |  |  |  |  | 
| 85 | 4 | 50 |  |  |  |  | Reset(); | 
| 86 | 4 |  |  |  |  |  | } | 
| 87 |  |  |  |  |  |  |  | 
| 88 | 12 |  |  |  |  |  | nsSBCSGroupProber::~nsSBCSGroupProber() | 
| 89 |  |  |  |  |  |  | { | 
| 90 | 56 | 100 |  |  |  |  | for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++) | 
| 91 |  |  |  |  |  |  | { | 
| 92 | 52 | 50 |  |  |  |  | delete mProbers[i]; | 
|  |  | 50 |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | } | 
| 94 | 8 |  |  |  |  |  | } | 
| 95 |  |  |  |  |  |  |  | 
| 96 |  |  |  |  |  |  |  | 
| 97 | 0 |  |  |  |  |  | const char* nsSBCSGroupProber::GetCharSetName() | 
| 98 |  |  |  |  |  |  | { | 
| 99 |  |  |  |  |  |  | //if we have no answer yet | 
| 100 | 0 | 0 |  |  |  |  | if (mBestGuess == -1) | 
| 101 |  |  |  |  |  |  | { | 
| 102 | 0 |  |  |  |  |  | GetConfidence(); | 
| 103 |  |  |  |  |  |  | //no charset seems positive | 
| 104 | 0 | 0 |  |  |  |  | if (mBestGuess == -1) | 
| 105 |  |  |  |  |  |  | //we will use default. | 
| 106 | 0 |  |  |  |  |  | mBestGuess = 0; | 
| 107 |  |  |  |  |  |  | } | 
| 108 | 0 |  |  |  |  |  | return mProbers[mBestGuess]->GetCharSetName(); | 
| 109 |  |  |  |  |  |  | } | 
| 110 |  |  |  |  |  |  |  | 
| 111 | 4 |  |  |  |  |  | void  nsSBCSGroupProber::Reset(void) | 
| 112 |  |  |  |  |  |  | { | 
| 113 | 4 |  |  |  |  |  | mActiveNum = 0; | 
| 114 | 56 | 100 |  |  |  |  | for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++) | 
| 115 |  |  |  |  |  |  | { | 
| 116 | 52 | 50 |  |  |  |  | if (mProbers[i]) // not null | 
| 117 |  |  |  |  |  |  | { | 
| 118 | 52 |  |  |  |  |  | mProbers[i]->Reset(); | 
| 119 | 52 |  |  |  |  |  | mIsActive[i] = PR_TRUE; | 
| 120 | 52 |  |  |  |  |  | ++mActiveNum; | 
| 121 |  |  |  |  |  |  | } | 
| 122 |  |  |  |  |  |  | else | 
| 123 | 0 |  |  |  |  |  | mIsActive[i] = PR_FALSE; | 
| 124 |  |  |  |  |  |  | } | 
| 125 | 4 |  |  |  |  |  | mBestGuess = -1; | 
| 126 | 4 |  |  |  |  |  | mState = eDetecting; | 
| 127 | 4 |  |  |  |  |  | } | 
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  |  | 
| 130 | 4 |  |  |  |  |  | nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) | 
| 131 |  |  |  |  |  |  | { | 
| 132 |  |  |  |  |  |  | nsProbingState st; | 
| 133 |  |  |  |  |  |  | PRUint32 i; | 
| 134 | 4 |  |  |  |  |  | char *newBuf1 = 0; | 
| 135 | 4 |  |  |  |  |  | PRUint32 newLen1 = 0; | 
| 136 |  |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  | //apply filter to original buffer, and we got new buffer back | 
| 138 |  |  |  |  |  |  | //depend on what script it is, we will feed them the new buffer | 
| 139 |  |  |  |  |  |  | //we got after applying proper filter | 
| 140 |  |  |  |  |  |  | //this is done without any consideration to KeepEnglishLetters | 
| 141 |  |  |  |  |  |  | //of each prober since as of now, there are no probers here which | 
| 142 |  |  |  |  |  |  | //recognize languages with English characters. | 
| 143 | 4 | 50 |  |  |  |  | if (!FilterWithoutEnglishLetters(aBuf, aLen, &newBuf1, newLen1)) | 
| 144 |  |  |  |  |  |  | goto done; | 
| 145 |  |  |  |  |  |  |  | 
| 146 | 4 | 50 |  |  |  |  | if (newLen1 == 0) | 
| 147 |  |  |  |  |  |  | goto done; // Nothing to see here, move on. | 
| 148 |  |  |  |  |  |  |  | 
| 149 | 56 | 100 |  |  |  |  | for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) | 
| 150 |  |  |  |  |  |  | { | 
| 151 | 52 | 50 |  |  |  |  | if (!mIsActive[i]) | 
| 152 |  |  |  |  |  |  | continue; | 
| 153 | 52 |  |  |  |  |  | st = mProbers[i]->HandleData(newBuf1, newLen1); | 
| 154 | 52 | 50 |  |  |  |  | if (st == eFoundIt) | 
| 155 |  |  |  |  |  |  | { | 
| 156 | 0 |  |  |  |  |  | mBestGuess = i; | 
| 157 | 0 |  |  |  |  |  | mState = eFoundIt; | 
| 158 | 0 |  |  |  |  |  | break; | 
| 159 |  |  |  |  |  |  | } | 
| 160 | 52 | 50 |  |  |  |  | else if (st == eNotMe) | 
| 161 |  |  |  |  |  |  | { | 
| 162 | 0 |  |  |  |  |  | mIsActive[i] = PR_FALSE; | 
| 163 | 0 |  |  |  |  |  | mActiveNum--; | 
| 164 | 0 | 0 |  |  |  |  | if (mActiveNum <= 0) | 
| 165 |  |  |  |  |  |  | { | 
| 166 | 0 |  |  |  |  |  | mState = eNotMe; | 
| 167 | 0 |  |  |  |  |  | break; | 
| 168 |  |  |  |  |  |  | } | 
| 169 |  |  |  |  |  |  | } | 
| 170 |  |  |  |  |  |  | } | 
| 171 |  |  |  |  |  |  |  | 
| 172 |  |  |  |  |  |  | done: | 
| 173 | 4 | 50 |  |  |  |  | PR_FREEIF(newBuf1); | 
| 174 |  |  |  |  |  |  |  | 
| 175 | 4 |  |  |  |  |  | return mState; | 
| 176 |  |  |  |  |  |  | } | 
| 177 |  |  |  |  |  |  |  | 
| 178 | 4 |  |  |  |  |  | float nsSBCSGroupProber::GetConfidence(void) | 
| 179 |  |  |  |  |  |  | { | 
| 180 |  |  |  |  |  |  | PRUint32 i; | 
| 181 |  |  |  |  |  |  | float bestConf = 0.0, cf; | 
| 182 |  |  |  |  |  |  |  | 
| 183 | 4 |  |  |  |  |  | switch (mState) | 
| 184 |  |  |  |  |  |  | { | 
| 185 |  |  |  |  |  |  | case eFoundIt: | 
| 186 |  |  |  |  |  |  | return (float)0.99; //sure yes | 
| 187 |  |  |  |  |  |  | case eNotMe: | 
| 188 | 0 |  |  |  |  |  | return (float)0.01;  //sure no | 
| 189 |  |  |  |  |  |  | default: | 
| 190 | 56 | 100 |  |  |  |  | for (i = 0; i < NUM_OF_SBCS_PROBERS; i++) | 
| 191 |  |  |  |  |  |  | { | 
| 192 | 52 | 50 |  |  |  |  | if (!mIsActive[i]) | 
| 193 |  |  |  |  |  |  | continue; | 
| 194 | 52 |  |  |  |  |  | cf = mProbers[i]->GetConfidence(); | 
| 195 | 52 | 100 |  |  |  |  | if (bestConf < cf) | 
| 196 |  |  |  |  |  |  | { | 
| 197 |  |  |  |  |  |  | bestConf = cf; | 
| 198 | 4 |  |  |  |  |  | mBestGuess = i; | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  | } | 
| 201 |  |  |  |  |  |  | } | 
| 202 |  |  |  |  |  |  | return bestConf; | 
| 203 |  |  |  |  |  |  | } | 
| 204 |  |  |  |  |  |  |  | 
#ifdef DEBUG_chardet
// Debug-only dump of the group: prints one line (or a nested dump) per
// prober slot, followed by the current best match and overall confidence.
void nsSBCSGroupProber::DumpStatus()
{
  PRUint32 i;
  float cf;

  cf = GetConfidence();
  printf(" SBCS Group Prober --------begin status \r\n");
  for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
  {
    // Reset() marks a slot inactive when its prober pointer is null, so an
    // inactive slot may have nothing to dereference. Skip those entirely.
    if (!mProbers[i])
      continue;

    if (!mIsActive[i])
      printf("  inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName());
    else
      mProbers[i]->DumpStatus();
  }

  // GetConfidence() leaves mBestGuess at -1 when it returns early or when no
  // active prober reports a confidence above 0; indexing mProbers with -1
  // would read outside the array, so report the absence of a match instead.
  if (mBestGuess == -1 || !mProbers[mBestGuess])
    printf(" SBCS Group found no best match, confidence %f.\r\n", cf);
  else
    printf(" SBCS Group found best match [%s] confidence %f.\r\n",
           mProbers[mBestGuess]->GetCharSetName(), cf);
}
#endif