File Coverage

blib/lib/Text/TinySegmenter.pm
Criterion Covered Total %
statement 103 105 98.1
branch 8 12 66.6
condition 1 3 33.3
subroutine 7 7 100.0
pod 1 1 100.0
total 120 128 93.7


line stmt bran cond sub pod time code
1             package Text::TinySegmenter;
2 2     2   122314 use 5.8.1;
  2         9  
  2         107  
3 2     2   12 use strict;
  2         5  
  2         77  
4 2     2   12 use warnings;
  2         9  
  2         102  
5 2     2   3551 use utf8;
  2         13  
  2         23  
6              
7             our $VERSION = '0.01';
8              
9             my %Patterns = (
10             "[一二三四五六七八九十百千万億兆]" => "M",
11             "[一-龠々〆ヵヶ]" => "H",
12             "[ぁ-ん]" => "I",
13             "[ァ-ヴーア-ン゙ー]" => "K",
14             "[a-zA-Za-zA-Z]" => "A",
15             "[0-90-9]" => "N",
16             );
17              
18             my @CharType;
19              
20             {
21             while (my ($key, $val) = each %Patterns) {
22             push @CharType, [qr/$key/, $val];
23             }
24             }
25              
26             my $BIAS = -332;
27             my %BC1 = ("HH" => 6,"II" => 2461,"KH" => 406,"OH" => -1378);
28             my %BC2 = ("AA" => -3267,"AI" => 2744,"AN" => -878,"HH" => -4070,"HM" => -1711,"HN" => 4012,"HO" => 3761,"IA" => 1327,"IH" => -1184,"II" => -1332,"IK" => 1721,"IO" => 5492,"KI" => 3831,"KK" => -8741,"MH" => -3132,"MK" => 3334,"OO" => -2920);
29             my %BC3 = ("HH" => 996,"HI" => 626,"HK" => -721,"HN" => -1307,"HO" => -836,"IH" => -301,"KK" => 2762,"MK" => 1079,"MM" => 4034,"OA" => -1652,"OH" => 266);
30             my %BP1 = ("BB" => 295,"OB" => 304,"OO" => -125,"UB" => 352);
31             my %BP2 = ("BO" => 60,"OO" => -1762);
32             my %BQ1 = ("BHH" => 1150,"BHM" => 1521,"BII" => -1158,"BIM" => 886,"BMH" => 1208,"BNH" => 449,"BOH" => -91,"BOO" => -2597,"OHI" => 451,"OIH" => -296,"OKA" => 1851,"OKH" => -1020,"OKK" => 904,"OOO" => 2965);
33             my %BQ2 = ("BHH" => 118,"BHI" => -1159,"BHM" => 466,"BIH" => -919,"BKK" => -1720,"BKO" => 864,"OHH" => -1139,"OHM" => -181,"OIH" => 153,"UHI" => -1146);
34             my %BQ3 = ("BHH" => -792,"BHI" => 2664,"BII" => -299,"BKI" => 419,"BMH" => 937,"BMM" => 8335,"BNN" => 998,"BOH" => 775,"OHH" => 2174,"OHM" => 439,"OII" => 280,"OKH" => 1798,"OKI" => -793,"OKO" => -2242,"OMH" => -2402,"OOO" => 11699);
35             my %BQ4 = ("BHH" => -3895,"BIH" => 3761,"BII" => -4654,"BIK" => 1348,"BKK" => -1806,"BMI" => -3385,"BOO" => -12396,"OAH" => 926,"OHH" => 266,"OHK" => -2036,"ONN" => -973);
36             my %BW1 = (",と" => 660,",同" => 727,"B1あ" => 1404,"B1同" => 542,"、と" => 660,"、同" => 727,"」と" => 1682,"あっ" => 1505,"いう" => 1743,"いっ" => -2055,"いる" => 672,"うし" => -4817,"うん" => 665,"から" => 3472,"がら" => 600,"こう" => -790,"こと" => 2083,"こん" => -1262,"さら" => -4143,"さん" => 4573,"した" => 2641,"して" => 1104,"すで" => -3399,"そこ" => 1977,"それ" => -871,"たち" => 1122,"ため" => 601,"った" => 3463,"つい" => -802,"てい" => 805,"てき" => 1249,"でき" => 1127,"です" => 3445,"では" => 844,"とい" => -4915,"とみ" => 1922,"どこ" => 3887,"ない" => 5713,"なっ" => 3015,"など" => 7379,"なん" => -1113,"にし" => 2468,"には" => 1498,"にも" => 1671,"に対" => -912,"の一" => -501,"の中" => 741,"ませ" => 2448,"まで" => 1711,"まま" => 2600,"まる" => -2155,"やむ" => -1947,"よっ" => -2565,"れた" => 2369,"れで" => -913,"をし" => 1860,"を見" => 731,"亡く" => -1886,"京都" => 2558,"取り" => -2784,"大き" => -2604,"大阪" => 1497,"平方" => -2314,"引き" => -1336,"日本" => -195,"本当" => -2423,"毎日" => -2113,"目指" => -724,"B1あ" => 1404,"B1同" => 542,"」と" => 1682);
37             my %BW2 = (".." => -11822,"11" => -669,"――" => -5730,"−−" => -13175,"いう" => -1609,"うか" => 2490,"かし" => -1350,"かも" => -602,"から" => -7194,"かれ" => 4612,"がい" => 853,"がら" => -3198,"きた" => 1941,"くな" => -1597,"こと" => -8392,"この" => -4193,"させ" => 4533,"され" => 13168,"さん" => -3977,"しい" => -1819,"しか" => -545,"した" => 5078,"して" => 972,"しな" => 939,"その" => -3744,"たい" => -1253,"たた" => -662,"ただ" => -3857,"たち" => -786,"たと" => 1224,"たは" => -939,"った" => 4589,"って" => 1647,"っと" => -2094,"てい" => 6144,"てき" => 3640,"てく" => 2551,"ては" => -3110,"ても" => -3065,"でい" => 2666,"でき" => -1528,"でし" => -3828,"です" => -4761,"でも" => -4203,"とい" => 1890,"とこ" => -1746,"とと" => -2279,"との" => 720,"とみ" => 5168,"とも" => -3941,"ない" => -2488,"なが" => -1313,"など" => -6509,"なの" => 2614,"なん" => 3099,"にお" => -1615,"にし" => 2748,"にな" => 2454,"によ" => -7236,"に対" => -14943,"に従" => -4688,"に関" => -11388,"のか" => 2093,"ので" => -7059,"のに" => -6041,"のの" => -6125,"はい" => 1073,"はが" => -1033,"はず" => -2532,"ばれ" => 1813,"まし" => -1316,"まで" => -6621,"まれ" => 5409,"めて" => -3153,"もい" => 2230,"もの" => -10713,"らか" => -944,"らし" => -1611,"らに" => -1897,"りし" => 651,"りま" => 1620,"れた" => 4270,"れて" => 849,"れば" => 4114,"ろう" => 6067,"われ" => 7901,"を通" => -11877,"んだ" => 728,"んな" => -4115,"一人" => 602,"一方" => -1375,"一日" => 970,"一部" => -1051,"上が" => -4479,"会社" => -1116,"出て" => 2163,"分の" => -7758,"同党" => 970,"同日" => -913,"大阪" => -2471,"委員" => -1250,"少な" => -1050,"年度" => -8669,"年間" => -1626,"府県" => -2363,"手権" => -1982,"新聞" => -4066,"日新" => -722,"日本" => -7068,"日米" => 3372,"曜日" => -601,"朝鮮" => -2355,"本人" => -2697,"東京" => -1543,"然と" => -1384,"社会" => -1276,"立て" => -990,"第に" => -1612,"米国" => -4268,"11" => -669);
38             my %BW3 = ("あた" => -2194,"あり" => 719,"ある" => 3846,"い." => -1185,"い。" => -1185,"いい" => 5308,"いえ" => 2079,"いく" => 3029,"いた" => 2056,"いっ" => 1883,"いる" => 5600,"いわ" => 1527,"うち" => 1117,"うと" => 4798,"えと" => 1454,"か." => 2857,"か。" => 2857,"かけ" => -743,"かっ" => -4098,"かに" => -669,"から" => 6520,"かり" => -2670,"が," => 1816,"が、" => 1816,"がき" => -4855,"がけ" => -1127,"がっ" => -913,"がら" => -4977,"がり" => -2064,"きた" => 1645,"けど" => 1374,"こと" => 7397,"この" => 1542,"ころ" => -2757,"さい" => -714,"さを" => 976,"し," => 1557,"し、" => 1557,"しい" => -3714,"した" => 3562,"して" => 1449,"しな" => 2608,"しま" => 1200,"す." => -1310,"す。" => -1310,"する" => 6521,"ず," => 3426,"ず、" => 3426,"ずに" => 841,"そう" => 428,"た." => 8875,"た。" => 8875,"たい" => -594,"たの" => 812,"たり" => -1183,"たる" => -853,"だ." => 4098,"だ。" => 4098,"だっ" => 1004,"った" => -4748,"って" => 300,"てい" => 6240,"てお" => 855,"ても" => 302,"です" => 1437,"でに" => -1482,"では" => 2295,"とう" => -1387,"とし" => 2266,"との" => 541,"とも" => -3543,"どう" => 4664,"ない" => 1796,"なく" => -903,"など" => 2135,"に," => -1021,"に、" => -1021,"にし" => 1771,"にな" => 1906,"には" => 2644,"の," => -724,"の、" => -724,"の子" => -1000,"は," => 1337,"は、" => 1337,"べき" => 2181,"まし" => 1113,"ます" => 6943,"まっ" => -1549,"まで" => 6154,"まれ" => -793,"らし" => 1479,"られ" => 6820,"るる" => 3818,"れ," => 854,"れ、" => 854,"れた" => 1850,"れて" => 1375,"れば" => -3246,"れる" => 1091,"われ" => -605,"んだ" => 606,"んで" => 798,"カ月" => 990,"会議" => 860,"入り" => 1232,"大会" => 2217,"始め" => 1681,"市" => 965,"新聞" => -5055,"日," => 974,"日、" => 974,"社会" => 2024,"カ月" => 990);
39             my %TC1 = ("AAA" => 1093,"HHH" => 1029,"HHM" => 580,"HII" => 998,"HOH" => -390,"HOM" => -331,"IHI" => 1169,"IOH" => -142,"IOI" => -1015,"IOM" => 467,"MMH" => 187,"OOI" => -1832);
40             my %TC2 = ("HHO" => 2088,"HII" => -1023,"HMM" => -1154,"IHI" => -1965,"KKH" => 703,"OII" => -2649);
41             my %TC3 = ("AAA" => -294,"HHH" => 346,"HHI" => -341,"HII" => -1088,"HIK" => 731,"HOH" => -1486,"IHH" => 128,"IHI" => -3041,"IHO" => -1935,"IIH" => -825,"IIM" => -1035,"IOI" => -542,"KHH" => -1216,"KKA" => 491,"KKH" => -1217,"KOK" => -1009,"MHH" => -2694,"MHM" => -457,"MHO" => 123,"MMH" => -471,"NNH" => -1689,"NNO" => 662,"OHO" => -3393);
42             my %TC4 = ("HHH" => -203,"HHI" => 1344,"HHK" => 365,"HHM" => -122,"HHN" => 182,"HHO" => 669,"HIH" => 804,"HII" => 679,"HOH" => 446,"IHH" => 695,"IHO" => -2324,"IIH" => 321,"III" => 1497,"IIO" => 656,"IOO" => 54,"KAK" => 4845,"KKA" => 3386,"KKK" => 3065,"MHH" => -405,"MHI" => 201,"MMH" => -241,"MMM" => 661,"MOM" => 841);
43             my %TQ1 = ("BHHH" => -227,"BHHI" => 316,"BHIH" => -132,"BIHH" => 60,"BIII" => 1595,"BNHH" => -744,"BOHH" => 225,"BOOO" => -908,"OAKK" => 482,"OHHH" => 281,"OHIH" => 249,"OIHI" => 200,"OIIH" => -68);
44             my %TQ2 = ("BIHH" => -1401,"BIII" => -1033,"BKAK" => -543,"BOOO" => -5591);
45             my %TQ3 = ("BHHH" => 478,"BHHM" => -1073,"BHIH" => 222,"BHII" => -504,"BIIH" => -116,"BIII" => -105,"BMHI" => -863,"BMHM" => -464,"BOMH" => 620,"OHHH" => 346,"OHHI" => 1729,"OHII" => 997,"OHMH" => 481,"OIHH" => 623,"OIIH" => 1344,"OKAK" => 2792,"OKHH" => 587,"OKKA" => 679,"OOHH" => 110,"OOII" => -685);
46             my %TQ4 = ("BHHH" => -721,"BHHM" => -3604,"BHII" => -966,"BIIH" => -607,"BIII" => -2181,"OAAA" => -2763,"OAKK" => 180,"OHHH" => -294,"OHHI" => 2446,"OHHO" => 480,"OHIH" => -1573,"OIHH" => 1935,"OIHI" => -493,"OIIH" => 626,"OIII" => -4007,"OKAK" => -8156);
47             my %TW1 = ("につい" => -4681,"東京都" => 2026);
48             my %TW2 = ("ある程" => -2049,"いった" => -1256,"ころが" => -2434,"しょう" => 3873,"その後" => -4430,"だって" => -1049,"ていた" => 1833,"として" => -4657,"ともに" => -4517,"もので" => 1882,"一気に" => -792,"初めて" => -1512,"同時に" => -8097,"大きな" => -1255,"対して" => -2721,"社会党" => -3216);
49             my %TW3 = ("いただ" => -1734,"してい" => 1314,"として" => -4314,"につい" => -5483,"にとっ" => -5989,"に当た" => -6247,"ので," => -727,"ので、" => -727,"のもの" => -600,"れから" => -3752,"十二月" => -2287);
50             my %TW4 = ("いう." => 8576,"いう。" => 8576,"からな" => -2348,"してい" => 2958,"たが," => 1516,"たが、" => 1516,"ている" => 1538,"という" => 1349,"ました" => 5543,"ません" => 1097,"ようと" => -4258,"よると" => 5865);
51             my %UC1 = ("A" => 484,"K" => 93,"M" => 645,"O" => -505);
52             my %UC2 = ("A" => 819,"H" => 1059,"I" => 409,"M" => 3987,"N" => 5775,"O" => 646);
53             my %UC3 = ("A" => -1370,"I" => 2311);
54             my %UC4 = ("A" => -2643,"H" => 1809,"I" => -1032,"K" => -3450,"M" => 3565,"N" => 3876,"O" => 6646);
55             my %UC5 = ("H" => 313,"I" => -1238,"K" => -799,"M" => 539,"O" => -831);
56             my %UC6 = ("H" => -506,"I" => -253,"K" => 87,"M" => 247,"O" => -387);
57             my %UP1 = ("O" => -214);
58             my %UP2 = ("B" => 69,"O" => 935);
59             my %UP3 = ("B" => 189);
60             my %UQ1 = ("BH" => 21,"BI" => -12,"BK" => -99,"BN" => 142,"BO" => -56,"OH" => -95,"OI" => 477,"OK" => 410,"OO" => -2422);
61             my %UQ2 = ("BH" => 216,"BI" => 113,"OK" => 1759);
62             my %UQ3 = ("BA" => -479,"BH" => 42,"BI" => 1913,"BK" => -7198,"BM" => 3160,"BN" => 6427,"BO" => 14761,"OI" => -827,"ON" => -3212);
63             my %UW1 = ("," => 156,"、" => 156,"「" => -463,"あ" => -941,"う" => -127,"が" => -553,"き" => 121,"こ" => 505,"で" => -201,"と" => -547,"ど" => -123,"に" => -789,"の" => -185,"は" => -847,"も" => -466,"や" => -470,"よ" => 182,"ら" => -292,"り" => 208,"れ" => 169,"を" => -446,"ん" => -137,"・" => -135,"主" => -402,"京" => -268,"区" => -912,"午" => 871,"国" => -460,"大" => 561,"委" => 729,"市" => -411,"日" => -141,"理" => 361,"生" => -408,"県" => -386,"都" => -718,"「" => -463,"・" => -135);
64             my %UW2 = ("," => -829,"、" => -829,"〇" => 892,"「" => -645,"」" => 3145,"あ" => -538,"い" => 505,"う" => 134,"お" => -502,"か" => 1454,"が" => -856,"く" => -412,"こ" => 1141,"さ" => 878,"ざ" => 540,"し" => 1529,"す" => -675,"せ" => 300,"そ" => -1011,"た" => 188,"だ" => 1837,"つ" => -949,"て" => -291,"で" => -268,"と" => -981,"ど" => 1273,"な" => 1063,"に" => -1764,"の" => 130,"は" => -409,"ひ" => -1273,"べ" => 1261,"ま" => 600,"も" => -1263,"や" => -402,"よ" => 1639,"り" => -579,"る" => -694,"れ" => 571,"を" => -2516,"ん" => 2095,"ア" => -587,"カ" => 306,"キ" => 568,"ッ" => 831,"三" => -758,"不" => -2150,"世" => -302,"中" => -968,"主" => -861,"事" => 492,"人" => -123,"会" => 978,"保" => 362,"入" => 548,"初" => -3025,"副" => -1566,"北" => -3414,"区" => -422,"大" => -1769,"天" => -865,"太" => -483,"子" => -1519,"学" => 760,"実" => 1023,"小" => -2009,"市" => -813,"年" => -1060,"強" => 1067,"手" => -1519,"揺" => -1033,"政" => 1522,"文" => -1355,"新" => -1682,"日" => -1815,"明" => -1462,"最" => -630,"朝" => -1843,"本" => -1650,"東" => -931,"果" => -665,"次" => -2378,"民" => -180,"気" => -1740,"理" => 752,"発" => 529,"目" => -1584,"相" => -242,"県" => -1165,"立" => -763,"第" => 810,"米" => 509,"自" => -1353,"行" => 838,"西" => -744,"見" => -3874,"調" => 1010,"議" => 1198,"込" => 3041,"開" => 1758,"間" => -1257,"「" => -645,"」" => 3145,"ッ" => 831,"ア" => -587,"カ" => 306,"キ" => 568);
65             my %UW3 = ("," => 4889,"1" => -800,"−" => -1723,"、" => 4889,"々" => -2311,"〇" => 5827,"」" => 2670,"〓" => -3573,"あ" => -2696,"い" => 1006,"う" => 2342,"え" => 1983,"お" => -4864,"か" => -1163,"が" => 3271,"く" => 1004,"け" => 388,"げ" => 401,"こ" => -3552,"ご" => -3116,"さ" => -1058,"し" => -395,"す" => 584,"せ" => 3685,"そ" => -5228,"た" => 842,"ち" => -521,"っ" => -1444,"つ" => -1081,"て" => 6167,"で" => 2318,"と" => 1691,"ど" => -899,"な" => -2788,"に" => 2745,"の" => 4056,"は" => 4555,"ひ" => -2171,"ふ" => -1798,"へ" => 1199,"ほ" => -5516,"ま" => -4384,"み" => -120,"め" => 1205,"も" => 2323,"や" => -788,"よ" => -202,"ら" => 727,"り" => 649,"る" => 5905,"れ" => 2773,"わ" => -1207,"を" => 6620,"ん" => -518,"ア" => 551,"グ" => 1319,"ス" => 874,"ッ" => -1350,"ト" => 521,"ム" => 1109,"ル" => 1591,"ロ" => 2201,"ン" => 278,"・" => -3794,"一" => -1619,"下" => -1759,"世" => -2087,"両" => 3815,"中" => 653,"主" => -758,"予" => -1193,"二" => 974,"人" => 2742,"今" => 792,"他" => 1889,"以" => -1368,"低" => 811,"何" => 4265,"作" => -361,"保" => -2439,"元" => 4858,"党" => 3593,"全" => 1574,"公" => -3030,"六" => 755,"共" => -1880,"円" => 5807,"再" => 3095,"分" => 457,"初" => 2475,"別" => 1129,"前" => 2286,"副" => 4437,"力" => 365,"動" => -949,"務" => -1872,"化" => 1327,"北" => -1038,"区" => 4646,"千" => -2309,"午" => -783,"協" => -1006,"口" => 483,"右" => 1233,"各" => 3588,"合" => -241,"同" => 3906,"和" => -837,"員" => 4513,"国" => 642,"型" => 1389,"場" => 1219,"外" => -241,"妻" => 2016,"学" => -1356,"安" => -423,"実" => -1008,"家" => 1078,"小" => -513,"少" => -3102,"州" => 1155,"市" => 3197,"平" => -1804,"年" => 2416,"広" => -1030,"府" => 1605,"度" => 1452,"建" => -2352,"当" => -3885,"得" => 1905,"思" => -1291,"性" => 1822,"戸" => -488,"指" => -3973,"政" => -2013,"教" => -1479,"数" => 3222,"文" => -1489,"新" => 1764,"日" => 2099,"旧" => 5792,"昨" => -661,"時" => -1248,"曜" => -951,"最" => -937,"月" => 4125,"期" => 360,"李" => 3094,"村" => 364,"東" => -805,"核" => 5156,"森" => 2438,"業" => 484,"氏" => 2613,"民" => -1694,"決" => -1073,"法" => 1868,"海" => -495,"無" => 979,"物" => 461,"特" => -3850,"生" => -273,"用" => 914,"町" => 1215,"的" => 7313,"直" => -1835,"省" => 792,"県" => 6293,"知" => -1528,"私" => 4231,"税" => 401,"立" => -960,"第" => 1201,"米" => 7767,"系" => 3066,"約" => 3663,"級" => 1384,"統" => -4229,"総" => 1163,"線" => 1255,"者" => 6457,"能" => 725,"自" => -2869,"英" => 785,"見" => 1044,"調" => -562,"財" => -733,"費" => 1777,"車" => 1835,"軍" => 1375,"込" => -1504,"通" => -1136,"選" => -681,"郎" => 1026,"郡" => 4404,"部" => 1200,"金" => 2163,"長" => 421,"開" => -1432,"間" => 1302,"関" => -1282,"雨" => 2009,"電" => -1045,"非" => 2066,"駅" => 1620,"1" => -800,"」" => 2670,"・" => -3794,"ッ" => -1350,"ア" => 551,"グ" => 1319,"ス" => 874,"ト" => 521,"ム" => 1109,"ル" => 1591,"ロ" => 2201,"ン" => 278);
66             my %UW4 = ("," => 3930,"." => 3508,"―" => -4841,"、" => 3930,"。" => 3508,"〇" => 4999,"「" => 1895,"」" => 3798,"〓" => -5156,"あ" => 4752,"い" => -3435,"う" => -640,"え" => -2514,"お" => 2405,"か" => 530,"が" => 6006,"き" => -4482,"ぎ" => -3821,"く" => -3788,"け" => -4376,"げ" => -4734,"こ" => 2255,"ご" => 1979,"さ" => 2864,"し" => -843,"じ" => -2506,"す" => -731,"ず" => 1251,"せ" => 181,"そ" => 4091,"た" => 5034,"だ" => 5408,"ち" => -3654,"っ" => -5882,"つ" => -1659,"て" => 3994,"で" => 7410,"と" => 4547,"な" => 5433,"に" => 6499,"ぬ" => 1853,"ね" => 1413,"の" => 7396,"は" => 8578,"ば" => 1940,"ひ" => 4249,"び" => -4134,"ふ" => 1345,"へ" => 6665,"べ" => -744,"ほ" => 1464,"ま" => 1051,"み" => -2082,"む" => -882,"め" => -5046,"も" => 4169,"ゃ" => -2666,"や" => 2795,"ょ" => -1544,"よ" => 3351,"ら" => -2922,"り" => -9726,"る" => -14896,"れ" => -2613,"ろ" => -4570,"わ" => -1783,"を" => 13150,"ん" => -2352,"カ" => 2145,"コ" => 1789,"セ" => 1287,"ッ" => -724,"ト" => -403,"メ" => -1635,"ラ" => -881,"リ" => -541,"ル" => -856,"ン" => -3637,"・" => -4371,"ー" => -11870,"一" => -2069,"中" => 2210,"予" => 782,"事" => -190,"井" => -1768,"人" => 1036,"以" => 544,"会" => 950,"体" => -1286,"作" => 530,"側" => 4292,"先" => 601,"党" => -2006,"共" => -1212,"内" => 584,"円" => 788,"初" => 1347,"前" => 1623,"副" => 3879,"力" => -302,"動" => -740,"務" => -2715,"化" => 776,"区" => 4517,"協" => 1013,"参" => 1555,"合" => -1834,"和" => -681,"員" => -910,"器" => -851,"回" => 1500,"国" => -619,"園" => -1200,"地" => 866,"場" => -1410,"塁" => -2094,"士" => -1413,"多" => 1067,"大" => 571,"子" => -4802,"学" => -1397,"定" => -1057,"寺" => -809,"小" => 1910,"屋" => -1328,"山" => -1500,"島" => -2056,"川" => -2667,"市" => 2771,"年" => 374,"庁" => -4556,"後" => 456,"性" => 553,"感" => 916,"所" => -1566,"支" => 856,"改" => 787,"政" => 2182,"教" => 704,"文" => 522,"方" => -856,"日" => 1798,"時" => 1829,"最" => 845,"月" => -9066,"木" => -485,"来" => -442,"校" => -360,"業" => -1043,"氏" => 5388,"民" => -2716,"気" => -910,"沢" => -939,"済" => -543,"物" => -735,"率" => 672,"球" => -1267,"生" => -1286,"産" => -1101,"田" => -2900,"町" => 1826,"的" => 2586,"目" => 922,"省" => -3485,"県" => 2997,"空" => -867,"立" => -2112,"第" => 788,"米" => 2937,"系" => 786,"約" => 2171,"経" => 1146,"統" => -1169,"総" => 940,"線" => -994,"署" => 749,"者" => 2145,"能" => -730,"般" => -852,"行" => -792,"規" => 792,"警" => -1184,"議" => -244,"谷" => -1000,"賞" => 730,"車" => -1481,"軍" => 1158,"輪" => -1433,"込" => -3370,"近" => 929,"道" => -1291,"選" => 2596,"郎" => -4866,"都" => 1192,"野" => -1100,"銀" => -2213,"長" => 357,"間" => -2344,"院" => -2297,"際" => -2604,"電" => -878,"領" => -1659,"題" => -792,"館" => -1984,"首" => 1749,"高" => 2120,"「" => 1895,"」" => 3798,"・" => -4371,"ッ" => -724,"ー" => -11870,"カ" => 2145,"コ" => 1789,"セ" => 1287,"ト" => -403,"メ" => -1635,"ラ" => -881,"リ" => -541,"ル" => -856,"ン" => -3637);
67             my %UW5 = ("," => 465,"." => -299,"1" => -514,"E2" => -32768,"]" => -2762,"、" => 465,"。" => -299,"「" => 363,"あ" => 1655,"い" => 331,"う" => -503,"え" => 1199,"お" => 527,"か" => 647,"が" => -421,"き" => 1624,"ぎ" => 1971,"く" => 312,"げ" => -983,"さ" => -1537,"し" => -1371,"す" => -852,"だ" => -1186,"ち" => 1093,"っ" => 52,"つ" => 921,"て" => -18,"で" => -850,"と" => -127,"ど" => 1682,"な" => -787,"に" => -1224,"の" => -635,"は" => -578,"べ" => 1001,"み" => 502,"め" => 865,"ゃ" => 3350,"ょ" => 854,"り" => -208,"る" => 429,"れ" => 504,"わ" => 419,"を" => -1264,"ん" => 327,"イ" => 241,"ル" => 451,"ン" => -343,"中" => -871,"京" => 722,"会" => -1153,"党" => -654,"務" => 3519,"区" => -901,"告" => 848,"員" => 2104,"大" => -1296,"学" => -548,"定" => 1785,"嵐" => -1304,"市" => -2991,"席" => 921,"年" => 1763,"思" => 872,"所" => -814,"挙" => 1618,"新" => -1682,"日" => 218,"月" => -4353,"査" => 932,"格" => 1356,"機" => -1508,"氏" => -1347,"田" => 240,"町" => -3912,"的" => -3149,"相" => 1319,"省" => -1052,"県" => -4003,"研" => -997,"社" => -278,"空" => -813,"統" => 1955,"者" => -2233,"表" => 663,"語" => -1073,"議" => 1219,"選" => -1018,"郎" => -368,"長" => 786,"間" => 1191,"題" => 2368,"館" => -689,"1" => -514,"E2" => -32768,"「" => 363,"イ" => 241,"ル" => 451,"ン" => -343);
68             my %UW6 = ("," => 227,"." => 808,"1" => -270,"E1" => 306,"、" => 227,"。" => 808,"あ" => -307,"う" => 189,"か" => 241,"が" => -73,"く" => -121,"こ" => -200,"じ" => 1782,"す" => 383,"た" => -428,"っ" => 573,"て" => -1014,"で" => 101,"と" => -105,"な" => -253,"に" => -149,"の" => -417,"は" => -236,"も" => -206,"り" => 187,"る" => -135,"を" => 195,"ル" => -673,"ン" => -496,"一" => -277,"中" => 201,"件" => -800,"会" => 624,"前" => 302,"区" => 1792,"員" => -1212,"委" => 798,"学" => -960,"市" => 887,"広" => -695,"後" => 535,"業" => -697,"相" => 753,"社" => -507,"福" => 974,"空" => -822,"者" => 1811,"連" => 463,"郎" => 1082,"1" => -270,"E1" => 306,"ル" => -673,"ン" => -496);
69              
70             sub _ctype {
71 9     9   12 my $str = shift;
72 9         11 for my $type (@CharType) {
73 28 100       125 if ($str =~ $type->[0]) {
74 9         28 return $type->[1];
75             }
76             }
77 0         0 return "O";
78             }
79              
80             sub _ts {
81 336 100   336   1007 $_[0] || 0;
82             }
83              
84             sub segment {
85 1     1 1 14 my ($class, $input) = @_;
86 1 50 33     11 if (!defined $input || $input eq '') {
87 0 0       0 return wantarray ? () : [];
88             }
89 1         2 my @result;
90 1         3 my @seg = ("B3","B2","B1");
91 1         4 my @ctype = ("O","O","O");
92 1         8 my @o = split //, $input;
93 1         3 for my $c (@o) {
94 9         14 push @seg, $c;
95 9         15 push @ctype, _ctype($c);
96             }
97 1         4 push @seg, "E1";
98 1         2 push @seg, "E2";
99 1         2 push @seg, "E3";
100 1         2 push @ctype, "O";
101 1         1 push @ctype, "O";
102 1         2 push @ctype, "O";
103 1         3 my $word = $seg[3];
104 1         1 my $p1 = "U";
105 1         2 my $p2 = "U";
106 1         2 my $p3 = "U";
107 1         9 for (my $i = 4; $i < @seg - 3; ++$i) {
108 8         8 my $score = $BIAS;
109 8         14 my $w1 = $seg[$i-3];
110 8         9 my $w2 = $seg[$i-2];
111 8         9 my $w3 = $seg[$i-1];
112 8         11 my $w4 = $seg[$i];
113 8         12 my $w5 = $seg[$i+1];
114 8         10 my $w6 = $seg[$i+2];
115 8         11 my $c1 = $ctype[$i-3];
116 8         9 my $c2 = $ctype[$i-2];
117 8         9 my $c3 = $ctype[$i-1];
118 8         9 my $c4 = $ctype[$i];
119 8         8 my $c5 = $ctype[$i+1];
120 8         10 my $c6 = $ctype[$i+2];
121 8         22 $score += _ts($UP1{$p1});
122 8         22 $score += _ts($UP2{$p2});
123 8         16 $score += _ts($UP3{$p3});
124 8         22 $score += _ts($BP1{$p1 . $p2});
125 8         19 $score += _ts($BP2{$p2 . $p3});
126 8         24 $score += _ts($UW1{$w1});
127 8         24 $score += _ts($UW2{$w2});
128 8         22 $score += _ts($UW3{$w3});
129 8         21 $score += _ts($UW4{$w4});
130 8         19 $score += _ts($UW5{$w5});
131 8         19 $score += _ts($UW6{$w6});
132 8         27 $score += _ts($BW1{$w2 . $w3});
133 8         27 $score += _ts($BW2{$w3 . $w4});
134 8         29 $score += _ts($BW3{$w4 . $w5});
135 8         26 $score += _ts($TW1{$w1 . $w2 . $w3});
136 8         24 $score += _ts($TW2{$w2 . $w3 . $w4});
137 8         28 $score += _ts($TW3{$w3 . $w4 . $w5});
138 8         36 $score += _ts($TW4{$w4 . $w5 . $w6});
139 8         23 $score += _ts($UC1{$c1});
140 8         14 $score += _ts($UC2{$c2});
141 8         16 $score += _ts($UC3{$c3});
142 8         16 $score += _ts($UC4{$c4});
143 8         14 $score += _ts($UC5{$c5});
144 8         15 $score += _ts($UC6{$c6});
145 8         54 $score += _ts($BC1{$c2 . $c3});
146 8         21 $score += _ts($BC2{$c3 . $c4});
147 8         15 $score += _ts($BC3{$c4 . $c5});
148 8         52 $score += _ts($TC1{$c1 . $c2 . $c3});
149 8         24 $score += _ts($TC2{$c2 . $c3 . $c4});
150 8         23 $score += _ts($TC3{$c3 . $c4 . $c5});
151 8         25 $score += _ts($TC4{$c4 . $c5 . $c6});
152             #$score += _ts($TC5{$c4 . $c5 . $c6});
153 8         16 $score += _ts($UQ1{$p1 . $c1});
154 8         19 $score += _ts($UQ2{$p2 . $c2});
155 8         18 $score += _ts($UQ1{$p3 . $c3});
156 8         21 $score += _ts($BQ1{$p2 . $c2 . $c3});
157 8         19 $score += _ts($BQ2{$p2 . $c3 . $c4});
158 8         19 $score += _ts($BQ3{$p3 . $c2 . $c3});
159 8         19 $score += _ts($BQ4{$p3 . $c3 . $c4});
160 8         29 $score += _ts($TQ1{$p2 . $c1 . $c2 . $c3});
161 8         23 $score += _ts($TQ2{$p2 . $c2 . $c3 . $c4});
162 8         22 $score += _ts($TQ3{$p3 . $c1 . $c2 . $c3});
163 8         24 $score += _ts($TQ4{$p3 . $c2 . $c3 . $c4});
164 8         10 my $p = "O";
165 8 100       18 if ($score > 0) {
166 5         8 push @result, $word;
167 5         6 $word = "";
168 5         8 $p = "B";
169             }
170 8         8 $p1 = $p2;
171 8         9 $p2 = $p3;
172 8         10 $p3 = $p;
173 8         42 $word .= $seg[$i];
174             }
175 1         4 push @result, $word;
176              
177 1 50       13 return wantarray ? @result : \@result;
178             }
179              
180             __END__