[ Index ]

PHP Cross Reference of MantisBT

title

Body

[close]

/library/utf8/exp/ -> regexunicode.php (source)

   1  <?php
   2  /**
   3  * This was an experiment to see how a PCRE based UTF-8 to unicode
   4  * code point converter would perform, vs. a character by character
   5  * converter (as in '../utf8_unicode.php'). Basically this is very
   6  * slow by comparison, but perhaps interesting code anyway.
   7  */
   /**
   * PCRE alternation (without delimiters) that consumes a byte string one
   * UTF-8 sequence at a time. Capture groups:
   *   1 - a single printable ASCII byte, tab, LF or CR
   *   2 - a well-formed 2-byte sequence
   *   3 - a well-formed 3-byte or 4-byte sequence
   *   4 - any other single byte, i.e. one that is not part of valid UTF-8
   *
   * The 4-byte alternatives share group 3 so that the bad-byte catch-all
   * stays at index 4, which is the index toCodePoint() tests for.
   *
   * The strings are single-quoted on purpose: the \xNN escapes are left for
   * PCRE to interpret, so the pattern must be used WITHOUT the /u modifier.
   */
   $UTF8_MATCH =
       '([\x09\x0A\x0D\x20-\x7E])'.              # ASCII (excluding control chars)
       '|([\xC2-\xDF][\x80-\xBF])'.              # non-overlong 2-byte
       '|(\xE0[\xA0-\xBF][\x80-\xBF]'.           # 3-byte, excluding overlongs
       '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.     # straight 3-byte
       '|\xED[\x80-\x9F][\x80-\xBF]'.            # excluding surrogates
       '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.         # planes 1-3
       '|[\xF1-\xF3][\x80-\xBF]{3}'.             # planes 4-15
       '|\xF4[\x80-\x8F][\x80-\xBF]{2})'.        # plane 16
       '|(.{1})';                                # catch bad bytes
  18  
  /**
  * preg_replace_callback() handler: decodes the UTF-8 sequence of one match
  * and appends its Unicode code point to the global $points array.
  *
  * The sequence is decoded by its byte length, not by the capture group it
  * arrived in, because the pattern may deliver sequences of different
  * lengths through the same group (e.g. 4-byte forms alongside 3-byte ones).
  *
  * @param array $matches match array: [0] is the whole match, [1]..[3] hold a
  *                       well-formed sequence, [4] holds a byte that is not
  *                       valid UTF-8. Trailing groups may be absent.
  * @return string the matched text unchanged, or '' for an invalid byte
  */
  function toCodePoint($matches) {
      global $points;

      // Groups 1-3 carry well-formed sequences; take the first non-empty one.
      foreach ( array(1, 2, 3) as $group ) {
          if ( !isset($matches[$group]) || $matches[$group] == '' ) {
              continue;
          }

          $seq = $matches[$group];
          switch ( strlen($seq) ) {
              case 1:
                  $points[] = ord($seq[0]);
                  break;
              case 2:
                  // 110xxxxx 10xxxxxx
                  $points[] = ( ( ord($seq[0]) & 0x1F ) << 6 )
                      | ( ord($seq[1]) & 0x3F );
                  break;
              case 3:
                  // 1110xxxx 10xxxxxx 10xxxxxx
                  $points[] = ( ( ord($seq[0]) & 0x0F ) << 12 )
                      | ( ( ord($seq[1]) & 0x3F ) << 6 )
                      | ( ord($seq[2]) & 0x3F );
                  break;
              case 4:
                  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                  $points[] = ( ( ord($seq[0]) & 0x07 ) << 18 )
                      | ( ( ord($seq[1]) & 0x3F ) << 12 )
                      | ( ( ord($seq[2]) & 0x3F ) << 6 )
                      | ( ord($seq[3]) & 0x3F );
                  break;
          }
          return $matches[0];
      }

      // Group 4 is the catch-all for bytes outside any valid sequence.
      if ( isset($matches[4]) && $matches[4] != '' ) {
          trigger_error('Invalid byte in UTF-8',E_USER_WARNING);
          return '';
      }

      return $matches[0];
  }
  33  
  // Driver: convert the sample document to code points and dump them.
  // The data path is relative to the current working directory.
  $str = file_get_contents('../tests/data/utf8.html');
  $points = array();
  if ( $str === false ) {
      // Do not feed a failed read into the regex as if it were an empty document.
      trigger_error('Unable to read ../tests/data/utf8.html',E_USER_WARNING);
  } else {
      // Only the callback's side effect (filling $points) is wanted here;
      // the string returned by preg_replace_callback() is discarded.
      preg_replace_callback('/'.$UTF8_MATCH.'/S','toCodePoint',$str);
  }
  print_r($points);


Generated: Thu Jul 28 15:48:31 2011 Cross-referenced by PHPXref 0.7