| [ Index ] |
PHP Cross Reference of MantisBT |
[Summary view] [Print] [Text view]
<?php
/**
 * This was an experiment to see how a PCRE based UTF-8 to unicode
 * code point converter would perform, vs. a character by character
 * converter (as in '../utf8_unicode.php'). Basically this is very
 * slow by comparison but perhaps interesting code anyway.
 *
 * The pattern uses one capture group per sequence length so that the
 * callback can choose the decoding formula from the group that matched:
 *   1 = ASCII, 2 = 2-byte, 3 = 3-byte, 4 = 4-byte, 5 = any other byte.
 * It is meant to be used WITHOUT the /u modifier, i.e. matched byte-wise.
 */
$UTF8_MATCH =
    '([\x09\x0A\x0D\x20-\x7E])'.           # 1: ASCII (excluding control chars)
    '|([\xC2-\xDF][\x80-\xBF])'.           # 2: non-overlong 2-byte
    '|(\xE0[\xA0-\xBF][\x80-\xBF]'.        # 3: 3-byte, excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.  #    straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF])'.        #    excluding surrogates
    '|(\xF0[\x90-\xBF][\x80-\xBF]{2}'.     # 4: planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.          #    planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2})'.     #    plane 16
    '|(.{1})';                             # 5: catch bad bytes

/**
 * preg_replace_callback() handler for $UTF8_MATCH.
 *
 * Decodes the single matched UTF-8 sequence and appends its code point to
 * the global $points array. Bytes that only matched the catch-all group
 * raise an E_USER_WARNING and are not recorded.
 *
 * @param array $matches Match array; exactly one of the groups 1-5 is non-empty.
 * @return string The matched bytes unchanged, or '' for an invalid byte.
 */
function toCodePoint($matches) {
    global $points;

    if ( $matches[1] != '' ) {
        // Single ASCII byte: the byte value is the code point.
        $points[] = ord($matches[1]);
    } else if ( $matches[2] != '' ) {
        // 110xxxxx 10xxxxxx
        $points[] = ( ( ord($matches[2][0]) & 0x1F ) << 6 )
                  |   ( ord($matches[2][1]) & 0x3F );
    } else if ( $matches[3] != '' ) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        $points[] = ( ( ord($matches[3][0]) & 0x0F ) << 12 )
                  | ( ( ord($matches[3][1]) & 0x3F ) << 6 )
                  |   ( ord($matches[3][2]) & 0x3F );
    } else if ( $matches[4] != '' ) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        $points[] = ( ( ord($matches[4][0]) & 0x07 ) << 18 )
                  | ( ( ord($matches[4][1]) & 0x3F ) << 12 )
                  | ( ( ord($matches[4][2]) & 0x3F ) << 6 )
                  |   ( ord($matches[4][3]) & 0x3F );
    } else {
        // Only the catch-all group matched: not part of any valid sequence.
        trigger_error('Invalid byte in UTF-8',E_USER_WARNING);
        return '';
    }

    return $matches[0];
}

$str = file_get_contents('../tests/data/utf8.html');
if ( $str === false ) {
    // file_get_contents() has already raised its own warning; decode nothing.
    $str = '';
}
$points = array();
preg_replace_callback('/'.$UTF8_MATCH.'/S','toCodePoint',$str);
print_r($points);
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Thu Jul 28 15:48:31 2011 | Cross-referenced by PHPXref 0.7 |