| [ Index ] |
PHP Cross Reference of MantisBT |
[Summary view] [Print] [Text view]
<?php
/**
* @version $Id: bad.php,v 1.2 2006/02/26 13:20:44 harryf Exp $
* Tools for locating / replacing bad bytes in UTF-8 strings
* The Original Code is Mozilla Communicator client code.
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
* Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
* Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage bad
* @see utf8_is_valid
*/

//--------------------------------------------------------------------
/**
* Builds the (delimited) PCRE pattern shared by the utf8_bad_* scanners.
*
* The pattern consumes exactly one unit per match: either one well-formed
* UTF-8 sequence (capture group 1 only) or, failing that, one single byte
* (capture group 2). Callers therefore test isset($matches[2]) to learn
* whether the byte at the current position is bad.
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @return string delimited regular expression
* @package utf8
* @subpackage bad
*/
function utf8_bad_pattern() {
    return '/'.
        '([\x00-\x7F]'.                         # ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.              # non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.          # excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   # straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.          # excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       # planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.           # planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.       # plane 16
        '|(.{1}))'.                             # invalid byte
        '/S';
}

//--------------------------------------------------------------------
/**
* Locates the first bad byte in a UTF-8 string, returning its
* byte index in the string
* @param string
* @return mixed integer byte index or FALSE if no bad found
* @see utf8_bad_pattern
* @package utf8
* @subpackage bad
*/
function utf8_bad_find($str) {
    $pattern = utf8_bad_pattern();
    $len = strlen($str);
    $pos = 0;

    // Every byte matches some alternative of the pattern, so a match started
    // at $pos always begins exactly at $pos; walking with the offset argument
    // avoids re-copying the remaining string on each step.
    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        if ( isset($matches[2]) ) {
            return $pos;
        }
        $pos += strlen($matches[0]);
    }
    return FALSE;
}

//--------------------------------------------------------------------
/**
* Locates all bad bytes in a UTF-8 string and returns a list of their
* byte index in the string
* @param string
* @return mixed array of integers or FALSE if no bad found
* @see utf8_bad_pattern
* @package utf8
* @subpackage bad
*/
function utf8_bad_findall($str) {
    $pattern = utf8_bad_pattern();
    $len = strlen($str);
    $pos = 0;
    $badList = array();

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        if ( isset($matches[2]) ) {
            $badList[] = $pos;
        }
        $pos += strlen($matches[0]);
    }

    if ( count($badList) > 0 ) {
        return $badList;
    }
    return FALSE;
}

//--------------------------------------------------------------------
/**
* Strips out any bad bytes from a UTF-8 string and returns the rest
* @param string
* @return string
* @see utf8_bad_pattern
* @package utf8
* @subpackage bad
*/
function utf8_bad_strip($str) {
    $pattern = utf8_bad_pattern();
    $len = strlen($str);
    $pos = 0;
    $result = '';

    // The result is accumulated in a plain string rather than an output
    // buffer, so the function has no effect on the caller's buffering state.
    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        if ( !isset($matches[2]) ) {
            $result .= $matches[0];
        }
        $pos += strlen($matches[0]);
    }
    return $result;
}

//--------------------------------------------------------------------
/**
* Replace bad bytes with an alternative character - ASCII character
* recommended is replacement char
* @param string to search
* @param string to replace bad bytes with (defaults to '?') - use ASCII
* @return string
* @see utf8_bad_pattern
* @package utf8
* @subpackage bad
*/
function utf8_bad_replace($str, $replace = '?') {
    $pattern = utf8_bad_pattern();
    $len = strlen($str);
    $pos = 0;
    $result = '';

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        if ( !isset($matches[2]) ) {
            $result .= $matches[0];
        } else {
            // One replacement is emitted per bad byte
            $result .= $replace;
        }
        $pos += strlen($matches[0]);
    }
    return $result;
}

//--------------------------------------------------------------------
/**
* Return code from utf8_bad_identify() when a five octet sequence is detected.
* Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so
* do not represent a useful character
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_5OCTET',1);

/**
* Return code from utf8_bad_identify() when a six octet sequence is detected.
* Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so
* do not represent a useful character
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_6OCTET',2);

/**
* Return code from utf8_bad_identify().
* Invalid octet for use as start of multi-byte UTF-8 sequence
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQID',3);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.1, non-shortest form is illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_NONSHORT',4);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.2, surrogate characters are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SURROGATE',5);

/**
* Return code from utf8_bad_identify().
* Codepoints outside the Unicode range are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_UNIOUTRANGE',6);

/**
* Return code from utf8_bad_identify().
* Incomplete multi-octet sequence
* Note: this is kind of a "catch-all"
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQINCOMPLETE',7);

//--------------------------------------------------------------------
/**
* Reports on the type of bad byte found in a UTF-8 string. Returns a
* status code on the first bad byte found
* @author <hsivonen@iki.fi>
* @param string UTF-8 encoded string
* @param int (by reference) receives the byte index associated with the
*        problem: the offending octet for UTF8_BAD_5OCTET, UTF8_BAD_6OCTET
*        and UTF8_BAD_SEQID; the final octet of the decoded sequence for
*        UTF8_BAD_NONSHORT, UTF8_BAD_SURROGATE and UTF8_BAD_UNIOUTRANGE;
*        the last octet seen before the sequence broke off for
*        UTF8_BAD_SEQINCOMPLETE. Set to NULL when the string is valid.
* @return mixed integer constant describing problem or FALSE if valid UTF-8
* @see utf8_bad_explain
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage bad
*/
function utf8_bad_identify($str, &$i) {

    $mState = 0; // cached expected number of octets after the current octet
                 // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0; // cached Unicode character
    $mBytes = 1; // cached expected number of octets in the current sequence

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {

        // Square-bracket offset: the curly-brace form no longer parses on PHP 8
        $in = ord($str[$i]);

        if ( $mState == 0 ) {

            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & $in)) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & $in)) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & $in)) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & $in)) {

                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 */
                return UTF8_BAD_5OCTET;

            } else if (0xFC == (0xFE & $in)) {

                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;

            } else {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;
            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & $in)) {

                // Legal continuation: merge its six payload bits into place.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                * End of the multi-octet sequence. mUcs4 now contains the final
                * Unicode codepoint to be output
                */
                if (0 == --$mState) {

                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
                        return UTF8_BAD_NONSHORT;

                    // From Unicode 3.2, surrogate characters are illegal
                    } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
                        return UTF8_BAD_SURROGATE;

                    // Codepoints outside the Unicode range are illegal
                    } else if ($mUcs4 > 0x10FFFF) {
                        return UTF8_BAD_UNIOUTRANGE;
                    }

                    // initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence. Step back so $i refers to
                // the last octet of the truncated sequence, not the octet
                // that interrupted it.
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }
        }
    }

    if ( $mState != 0 ) {
        // Incomplete multi-octet sequence at end of input; $i is one past the
        // end here, so step back onto the final octet.
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = NULL;
    return FALSE;
}

//--------------------------------------------------------------------
/**
* Takes a return code from utf8_bad_identify() and returns a message
* (in English) explaining what the problem is.
* Raises an E_USER_WARNING when the code is not recognised.
* @param int return code from utf8_bad_identify
* @return mixed string message or FALSE if return code unknown
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
function utf8_bad_explain($code) {

    switch ($code) {

        case UTF8_BAD_5OCTET:
            return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';

        case UTF8_BAD_6OCTET:
            return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';

        case UTF8_BAD_SEQID:
            return 'Invalid octet for use as start of multi-byte UTF-8 sequence';

        case UTF8_BAD_NONSHORT:
            return 'From Unicode 3.1, non-shortest form is illegal';

        case UTF8_BAD_SURROGATE:
            return 'From Unicode 3.2, surrogate characters are illegal';

        case UTF8_BAD_UNIOUTRANGE:
            return 'Codepoints outside the Unicode range are illegal';

        case UTF8_BAD_SEQINCOMPLETE:
            return 'Incomplete multi-octet sequence';

    }

    trigger_error('Unknown error code: '.$code,E_USER_WARNING);
    return FALSE;

}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Thu Jul 28 15:48:31 2011 | Cross-referenced by PHPXref 0.7 |