logo   

改进的函数(判断字符串是否为utf-8格式)php4.3 - - 博客园

$utf8score=utf8_probability(&$fstr);
if( 90<=$utf8score && $utf8score<=100){echo "It is encoded with utf-8 already ";die;}

echo $utf8score;

function utf8_probability(&$rawtextstr) {
$score = 0;
$i = 0;
$rawtextlen = 0;
$goodbytes = 0;
$asciibytes = 0;
$rawtextarray = preg_split("//",$rawtextstr,-1, PREG_SPLIT_NO_EMPTY); //转换成char数组,如果是php5,则可使用str_split
$rawtext = array();

//var_dump($rawtextarray);die;

for($i=0;$i<count($rawtextarray);$i++)
$rawtext[] = ord($rawtextarray[$i]); //ord(char)
// Maybe also use UTF8 Byte Order Mark(BOM): EF BB BF
//BOM,某些utf8文件流的首3个字节,可以表示这个文件的编码方式

// Check to see if characters fit into acceptable ranges
//print_r($rawtext);
$rawtextlen = strlen($rawtextstr);
for ($i = 0; $i < $rawtextlen; $i++) {
if ($rawtext[$i] < 0x80) { // One byte
$asciibytes++; // Ignore ASCII, can throw off count
} else if (0xC0 <= $rawtext[$i] && $rawtext[$i] <= 0xDF && // Two bytes
$i+1 < $rawtextlen && 0x80 <= $rawtext[$i+1] && $rawtext[$i+1] <= 0xBF) {
$goodbytes += 2; $i++;
} else if (0xE0 <= $rawtext[$i] && $rawtext[$i] <= 0xEF && // Three bytes
$i+2 < $rawtextlen && 0x80 <= $rawtext[$i+1] && $rawtext[$i+1] <= 0xBF &&
0x80 <= $rawtext[$i+2] && $rawtext[$i+2] <= 0xBF) {
$goodbytes += 3; $i+=2;
}
}
//ascii is sub of utf8
if ($asciibytes == $rawtextlen) { return 0; }
$score = (int)(100 * ($goodbytes/($rawtextlen-$asciibytes)));
// If not above 98, reduce to zero to prevent coincidental matches
if ($score > 98) {
return $score;
} else if ($score > 95 && $goodbytes > 30) {
// Allows for some (few) bad formed sequences
return $score;
} else {
return 0;
}
}
Tags:
访问文章出处: http://www.cnblogs.com/gudai/articles/168660.html(改进的函数(判断字符串是否为utf-8格式)php4.3 - - 博客园)

上一篇: 線上書籍
下一篇: 解决了数组形式的投票系统!!!续 <今天也是光棍节> 中内容_王韬-ASP PHP ASP.NET交流空间,Web Master Park!