[ Index ]

PHP Cross Reference of MantisBT

title

Body

[close]

/library/utf8/utils/ -> bad.php (source)

   1  <?php
   2  /**
   3  * @version $Id: bad.php,v 1.2 2006/02/26 13:20:44 harryf Exp $
   4  * Tools for locating / replacing bad bytes in UTF-8 strings
   5  * The Original Code is Mozilla Communicator client code.
   6  * The Initial Developer of the Original Code is
   7  * Netscape Communications Corporation.
   8  * Portions created by the Initial Developer are Copyright (C) 1998
   9  * the Initial Developer. All Rights Reserved.
  10  * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
  11  * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
  12  * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
  13  * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
  14  * @see http://hsivonen.iki.fi/php-utf8/
  15  * @package utf8
  16  * @subpackage bad
  17  * @see utf8_is_valid
  18  */
  19  
  20  //--------------------------------------------------------------------
  21  /**
  22  * Locates the first bad byte in a UTF-8 string returning it's
  23  * byte index in the string
  24  * PCRE Pattern to locate bad bytes in a UTF-8 string
  25  * Comes from W3 FAQ: Multilingual Forms
  26  * Note: modified to include full ASCII range including control chars
  27  * @see http://www.w3.org/International/questions/qa-forms-utf-8
  28  * @param string
  29  * @return mixed integer byte index or FALSE if no bad found
  30  * @package utf8
  31  * @subpackage bad
  32  */
  33  function utf8_bad_find($str) {
  34      $UTF8_BAD =
  35      '([\x00-\x7F]'.                          # ASCII (including control chars)
  36      '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
  37      '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
  38      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
  39      '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
  40      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
  41      '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
  42      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
  43      '|(.{1}))';                              # invalid byte
  44      $pos = 0;
  45      $badList = array();
  46      while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  47          $bytes = strlen($matches[0]);
  48          if ( isset($matches[2])) {
  49              return $pos;
  50          }
  51          $pos += $bytes;
  52          $str = substr($str,$bytes);
  53      }
  54      return FALSE;
  55  }
  56  
  57  //--------------------------------------------------------------------
  58  /**
  59  * Locates all bad bytes in a UTF-8 string and returns a list of their
  60  * byte index in the string
  61  * PCRE Pattern to locate bad bytes in a UTF-8 string
  62  * Comes from W3 FAQ: Multilingual Forms
  63  * Note: modified to include full ASCII range including control chars
  64  * @see http://www.w3.org/International/questions/qa-forms-utf-8
  65  * @param string
  66  * @return mixed array of integers or FALSE if no bad found
  67  * @package utf8
  68  * @subpackage bad
  69  */
  70  function utf8_bad_findall($str) {
  71      $UTF8_BAD =
  72      '([\x00-\x7F]'.                          # ASCII (including control chars)
  73      '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
  74      '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
  75      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
  76      '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
  77      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
  78      '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
  79      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
  80      '|(.{1}))';                              # invalid byte
  81      $pos = 0;
  82      $badList = array();
  83      while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
  84          $bytes = strlen($matches[0]);
  85          if ( isset($matches[2])) {
  86              $badList[] = $pos;
  87          }
  88          $pos += $bytes;
  89          $str = substr($str,$bytes);
  90      }
  91      if ( count($badList) > 0 ) {
  92          return $badList;
  93      }
  94      return FALSE;
  95  }
  96  
  97  //--------------------------------------------------------------------
  98  /**
  99  * Strips out any bad bytes from a UTF-8 string and returns the rest
 100  * PCRE Pattern to locate bad bytes in a UTF-8 string
 101  * Comes from W3 FAQ: Multilingual Forms
 102  * Note: modified to include full ASCII range including control chars
 103  * @see http://www.w3.org/International/questions/qa-forms-utf-8
 104  * @param string
 105  * @return string
 106  * @package utf8
 107  * @subpackage bad
 108  */
 109  function utf8_bad_strip($str) {
 110      $UTF8_BAD =
 111      '([\x00-\x7F]'.                          # ASCII (including control chars)
 112      '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 113      '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 114      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 115      '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 116      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 117      '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 118      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 119      '|(.{1}))';                              # invalid byte
 120      ob_start();
 121      while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 122          if ( !isset($matches[2])) {
 123              echo $matches[0];
 124          }
 125          $str = substr($str,strlen($matches[0]));
 126      }
 127      $result = ob_get_contents();
 128      ob_end_clean();
 129      return $result;
 130  }
 131  
 132  //--------------------------------------------------------------------
 133  /**
 134  * Replace bad bytes with an alternative character - ASCII character
 135  * recommended is replacement char
 136  * PCRE Pattern to locate bad bytes in a UTF-8 string
 137  * Comes from W3 FAQ: Multilingual Forms
 138  * Note: modified to include full ASCII range including control chars
 139  * @see http://www.w3.org/International/questions/qa-forms-utf-8
 140  * @param string to search
 141  * @param string to replace bad bytes with (defaults to '?') - use ASCII
 142  * @return string
 143  * @package utf8
 144  * @subpackage bad
 145  */
 146  function utf8_bad_replace($str, $replace = '?') {
 147      $UTF8_BAD =
 148      '([\x00-\x7F]'.                          # ASCII (including control chars)
 149      '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 150      '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 151      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 152      '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 153      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 154      '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 155      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 156      '|(.{1}))';                              # invalid byte
 157      ob_start();
 158      while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 159          if ( !isset($matches[2])) {
 160              echo $matches[0];
 161          } else {
 162              echo $replace;
 163          }
 164          $str = substr($str,strlen($matches[0]));
 165      }
 166      $result = ob_get_contents();
 167      ob_end_clean();
 168      return $result;
 169  }
 170  
 171  //--------------------------------------------------------------------
/**
* Return code from utf8_bad_identify() when the lead octet of a five octet
* sequence is detected.
* Note: utf8_bad_identify() treats this as illegal because such a sequence
* can only encode a non-shortest form or a codepoint outside the Unicode
* range of 0-0x10FFFF
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_5OCTET',1);

/**
* Return code from utf8_bad_identify() when the lead octet of a six octet
* sequence is detected.
* Note: rejected for the same reason as a five octet sequence
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_6OCTET',2);

/**
* Return code from utf8_bad_identify().
* Octet is neither US-ASCII nor a legal first octet of a multi-byte
* UTF-8 sequence
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQID',3);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.1, non-shortest form is illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_NONSHORT',4);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.2, surrogate characters are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SURROGATE',5);

/**
* Return code from utf8_bad_identify().
* Codepoints outside the Unicode range (above 0x10FFFF) are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_UNIOUTRANGE',6);

/**
* Return code from utf8_bad_identify().
* Incomplete multi-octet sequence: a continuation octet was expected but
* either a different octet or the end of the string was found
* Note: this is kind of a "catch-all"
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQINCOMPLETE',7);
 237  
 238  //--------------------------------------------------------------------
 239  /**
 240  * Reports on the type of bad byte found in a UTF-8 string. Returns a
 241  * status code on the first bad byte found
 242  * @author <hsivonen@iki.fi>
 243  * @param string UTF-8 encoded string
 244  * @return mixed integer constant describing problem or FALSE if valid UTF-8
 245  * @see utf8_bad_explain
 246  * @see http://hsivonen.iki.fi/php-utf8/
 247  * @package utf8
 248  * @subpackage bad
 249  */
 250  function utf8_bad_identify($str, &$i) {
 251      
 252      $mState = 0;     // cached expected number of octets after the current octet
 253                       // until the beginning of the next UTF8 character sequence
 254      $mUcs4  = 0;     // cached Unicode character
 255      $mBytes = 1;     // cached expected number of octets in the current sequence
 256      
 257      $len = strlen($str);
 258      
 259      for($i = 0; $i < $len; $i++) {
 260          
 261          $in = ord($str{$i});
 262          
 263          if ( $mState == 0) {
 264              
 265              // When mState is zero we expect either a US-ASCII character or a
 266              // multi-octet sequence.
 267              if (0 == (0x80 & ($in))) {
 268                  // US-ASCII, pass straight through.
 269                  $mBytes = 1;
 270                  
 271              } else if (0xC0 == (0xE0 & ($in))) {
 272                  // First octet of 2 octet sequence
 273                  $mUcs4 = ($in);
 274                  $mUcs4 = ($mUcs4 & 0x1F) << 6;
 275                  $mState = 1;
 276                  $mBytes = 2;
 277                  
 278              } else if (0xE0 == (0xF0 & ($in))) {
 279                  // First octet of 3 octet sequence
 280                  $mUcs4 = ($in);
 281                  $mUcs4 = ($mUcs4 & 0x0F) << 12;
 282                  $mState = 2;
 283                  $mBytes = 3;
 284                  
 285              } else if (0xF0 == (0xF8 & ($in))) {
 286                  // First octet of 4 octet sequence
 287                  $mUcs4 = ($in);
 288                  $mUcs4 = ($mUcs4 & 0x07) << 18;
 289                  $mState = 3;
 290                  $mBytes = 4;
 291                  
 292              } else if (0xF8 == (0xFC & ($in))) {
 293                  
 294                  /* First octet of 5 octet sequence.
 295                  *
 296                  * This is illegal because the encoded codepoint must be either
 297                  * (a) not the shortest form or
 298                  * (b) outside the Unicode range of 0-0x10FFFF.
 299                  */
 300                  
 301                  return UTF8_BAD_5OCTET;
 302                  
 303              } else if (0xFC == (0xFE & ($in))) {
 304                  
 305                  // First octet of 6 octet sequence, see comments for 5 octet sequence.
 306                  return UTF8_BAD_6OCTET;
 307                  
 308              } else {
 309                  // Current octet is neither in the US-ASCII range nor a legal first
 310                  // octet of a multi-octet sequence.
 311                  return UTF8_BAD_SEQID;
 312                  
 313              }
 314          
 315          } else {
 316              
 317              // When mState is non-zero, we expect a continuation of the multi-octet
 318              // sequence
 319              if (0x80 == (0xC0 & ($in))) {
 320                  
 321                  // Legal continuation.
 322                  $shift = ($mState - 1) * 6;
 323                  $tmp = $in;
 324                  $tmp = ($tmp & 0x0000003F) << $shift;
 325                  $mUcs4 |= $tmp;
 326              
 327                  /**
 328                  * End of the multi-octet sequence. mUcs4 now contains the final
 329                  * Unicode codepoint to be output
 330                  */
 331                  if (0 == --$mState) {
 332                      
 333                      // From Unicode 3.1, non-shortest form is illegal
 334                      if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 335                          ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 336                          ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
 337                          return UTF8_BAD_NONSHORT;
 338                          
 339                      // From Unicode 3.2, surrogate characters are illegal
 340                      } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
 341                          return UTF8_BAD_SURROGATE;
 342                          
 343                      // Codepoints outside the Unicode range are illegal
 344                      } else if ($mUcs4 > 0x10FFFF) {
 345                          return UTF8_BAD_UNIOUTRANGE;
 346                      }
 347                      
 348                      //initialize UTF8 cache
 349                      $mState = 0;
 350                      $mUcs4  = 0;
 351                      $mBytes = 1;
 352                  }
 353              
 354              } else {
 355                  // ((0xC0 & (*in) != 0x80) && (mState != 0))
 356                  // Incomplete multi-octet sequence.
 357                  $i--;
 358                  return UTF8_BAD_SEQINCOMPLETE;
 359              }
 360          }
 361      }
 362      
 363      if ( $mState != 0 ) {
 364          // Incomplete multi-octet sequence.
 365          $i--;
 366          return UTF8_BAD_SEQINCOMPLETE;
 367      }
 368      
 369      // No bad octets found
 370      $i = NULL;
 371      return FALSE;
 372  }
 373  
 374  //--------------------------------------------------------------------
 375  /**
 376  * Takes a return code from utf8_bad_identify() are returns a message
 377  * (in English) explaining what the problem is.
 378  * @param int return code from utf8_bad_identify
 379  * @return mixed string message or FALSE if return code unknown
 380  * @see utf8_bad_identify
 381  * @package utf8
 382  * @subpackage bad
 383  */
 384  function utf8_bad_explain($code) {
 385      
 386      switch ($code) {
 387          
 388          case UTF8_BAD_5OCTET:
 389              return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
 390          break;
 391          
 392          case UTF8_BAD_6OCTET:
 393              return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
 394          break;
 395          
 396          case UTF8_BAD_SEQID:
 397              return 'Invalid octet for use as start of multi-byte UTF-8 sequence';
 398          break;
 399          
 400          case UTF8_BAD_NONSHORT:
 401              return 'From Unicode 3.1, non-shortest form is illegal';
 402          break;
 403          
 404          case UTF8_BAD_SURROGATE:
 405              return 'From Unicode 3.2, surrogate characters are illegal';
 406          break;
 407          
 408          case UTF8_BAD_UNIOUTRANGE:
 409              return 'Codepoints outside the Unicode range are illegal';
 410          break;
 411          
 412          case UTF8_BAD_SEQINCOMPLETE:
 413              return 'Incomplete multi-octet sequence';
 414          break;
 415          
 416      }
 417      
 418      trigger_error('Unknown error code: '.$code,E_USER_WARNING);
 419      return FALSE;
 420      
 421  }


Generated: Thu Jul 28 15:48:31 2011 Cross-referenced by PHPXref 0.7