[ Index ]

PHP Cross Reference of MantisBT

title

Body

[close]

/library/utf8/utils/ -> position.php (source)

   1  <?php
   2  /**
   3  * Locate a byte index given a UTF-8 character index
   4  * @version $Id: position.php,v 1.1 2006/10/01 00:01:31 harryf Exp $
   5  * @package utf8
   6  * @subpackage position
   7  */
   8  
   9  //--------------------------------------------------------------------
  10  /**
  11  * Given a string and a character index in the string, in
  12  * terms of the UTF-8 character position, returns the byte
  13  * index of that character. Can be useful when you want to
  14  * PHP's native string functions but we warned, locating
  15  * the byte can be expensive
  16  * Takes variable number of parameters - first must be
  17  * the search string then 1 to n UTF-8 character positions
  18  * to obtain byte indexes for - it is more efficient to search
  19  * the string for multiple characters at once, than make
  20  * repeated calls to this function
  21  *
  22  * @author Chris Smith<chris@jalakai.co.uk>
  23  * @param string string to locate index in
  24  * @param int (n times)
  25  * @return mixed - int if only one input int, array if more
  26  * @return boolean TRUE if it's all ASCII
  27  * @package utf8
  28  * @subpackage position
  29  */
  30  function utf8_byte_position() {
  31      
  32      $args = func_get_args();
  33      $str =& array_shift($args);
  34      if (!is_string($str)) return false;
  35      
  36      $result = array();
  37      
  38      // trivial byte index, character offset pair
  39      $prev = array(0,0);
  40      
  41      // use a short piece of str to estimate bytes per character
  42      // $i (& $j) -> byte indexes into $str
  43      $i = utf8_locate_next_chr($str, 300);
  44      
  45      // $c -> character offset into $str
  46      $c = strlen(utf8_decode(substr($str,0,$i)));
  47      
  48      // deal with arguments from lowest to highest
  49      sort($args);
  50      
  51      foreach ($args as $offset) {
  52          // sanity checks FIXME
  53          
  54          // 0 is an easy check
  55          if ($offset == 0) { $result[] = 0; continue; }
  56          
  57          // ensure no endless looping
  58          $safety_valve = 50;
  59          
  60          do {
  61              
  62              if ( ($c - $prev[1]) == 0 ) {
  63                  // Hack: gone past end of string
  64                  $error = 0;
  65                  $i = strlen($str);
  66                  break;
  67              }
  68              
  69              $j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));
  70              
  71              // correct to utf8 character boundary
  72              $j = utf8_locate_next_chr($str, $j);
  73              
  74              // save the index, offset for use next iteration
  75              $prev = array($i,$c);
  76              
  77              if ($j > $i) {
  78                  // determine new character offset
  79                  $c += strlen(utf8_decode(substr($str,$i,$j-$i)));
  80              } else {
  81                  // ditto
  82                  $c -= strlen(utf8_decode(substr($str,$j,$i-$j)));
  83              }
  84              
  85              $error = abs($c-$offset);
  86              
  87              // ready for next time around
  88              $i = $j;
  89          
  90          // from 7 it is faster to iterate over the string
  91          } while ( ($error > 7) && --$safety_valve) ;
  92          
  93          if ($error && $error <= 7) {
  94              
  95              if ($c < $offset) {
  96                  // move up
  97                  while ($error--) { $i = utf8_locate_next_chr($str,++$i); }
  98              } else {
  99                  // move down
 100                  while ($error--) { $i = utf8_locate_current_chr($str,--$i); }
 101              }
 102              
 103              // ready for next arg
 104              $c = $offset;
 105          }
 106          $result[] = $i;
 107      }
 108      
 109      if ( count($result) == 1 ) {
 110          return $result[0];
 111      }
 112      
 113      return $result;
 114  }
 115  
 116  //--------------------------------------------------------------------
 117  /**
 118  * Given a string and any byte index, returns the byte index
 119  * of the start of the current UTF-8 character, relative to supplied
 120  * position. If the current character begins at the same place as the
 121  * supplied byte index, that byte index will be returned. Otherwise
 122  * this function will step backwards, looking for the index where
 123  * curent UTF-8 character begins
 124  * @author Chris Smith<chris@jalakai.co.uk>
 125  * @param string
 126  * @param int byte index in the string
 127  * @return int byte index of start of next UTF-8 character
 128  * @package utf8
 129  * @subpackage position
 130  */
 131  function utf8_locate_current_chr( &$str, $idx ) {
 132      
 133      if ($idx <= 0) return 0;
 134      
 135      $limit = strlen($str);
 136      if ($idx >= $limit) return $limit;
 137      
 138      // Binary value for any byte after the first in a multi-byte UTF-8 character
 139      // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
 140      // of byte - assuming well formed UTF-8
 141      while ($idx && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx--;
 142      
 143      return $idx;
 144  }
 145  
 146  //--------------------------------------------------------------------
 147  /**
 148  * Given a string and any byte index, returns the byte index
 149  * of the start of the next UTF-8 character, relative to supplied
 150  * position. If the next character begins at the same place as the
 151  * supplied byte index, that byte index will be returned.
 152  * @author Chris Smith<chris@jalakai.co.uk>
 153  * @param string
 154  * @param int byte index in the string
 155  * @return int byte index of start of next UTF-8 character
 156  * @package utf8
 157  * @subpackage position
 158  */
 159  function utf8_locate_next_chr( &$str, $idx ) {
 160      
 161      if ($idx <= 0) return 0;
 162      
 163      $limit = strlen($str);
 164      if ($idx >= $limit) return $limit;
 165      
 166      // Binary value for any byte after the first in a multi-byte UTF-8 character
 167      // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
 168      // of byte - assuming well formed UTF-8
 169      while (($idx < $limit) && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx++;
 170      
 171      return $idx;
 172  }
 173  


Generated: Thu Jul 28 15:48:31 2011 Cross-referenced by PHPXref 0.7