| [ Index ] |
PHP Cross Reference of MantisBT |
[Summary view] [Print] [Text view]
/**
 * Locate a byte index given a UTF-8 character index
 * @version $Id: position.php,v 1.1 2006/10/01 00:01:31 harryf Exp $
 * @package utf8
 * @subpackage position
 */

//--------------------------------------------------------------------
/**
 * Counts the UTF-8 characters in a run of bytes by counting every byte
 * that is not a continuation byte (10xxxxxx). This is the same notion of
 * a character boundary that utf8_locate_current_chr() and
 * utf8_locate_next_chr() use, and it assumes well formed UTF-8.
 *
 * Used in place of strlen(utf8_decode()), because utf8_decode() is
 * deprecated as of PHP 8.2.
 *
 * @param mixed bytes to count; anything that is not a string (for
 *              example FALSE returned by substr() on old PHP versions)
 *              counts as zero characters
 * @return int number of characters
 * @access private
 * @package utf8
 * @subpackage position
 */
function utf8_position_count_chars( $bytes ) {

    if (!is_string($bytes) || $bytes === '') return 0;

    // no /u modifier: the pattern is matched byte by byte
    return strlen(preg_replace('/[\x80-\xBF]/', '', $bytes));
}

//--------------------------------------------------------------------
/**
 * Given a string and a character index in the string, in
 * terms of the UTF-8 character position, returns the byte
 * index of that character. Can be useful when you want to use
 * PHP's native string functions but be warned, locating
 * the byte can be expensive
 * Takes variable number of parameters - first must be
 * the search string then 1 to n UTF-8 character positions
 * to obtain byte indexes for - it is more efficient to search
 * the string for multiple characters at once, than make
 * repeated calls to this function
 *
 * The character positions are sorted before they are processed, so
 * when several positions are passed the returned byte indexes are in
 * ascending order of position, not in the order of the arguments.
 * A position beyond the end of the string yields strlen() of the string.
 *
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string string to locate index in
 * @param int (n times)
 * @return mixed - int if exactly one position was given, array of int
 *                 otherwise, FALSE if the first argument is not a string
 * @package utf8
 * @subpackage position
 */
function utf8_byte_position() {

    $args = func_get_args();
    // plain assignment: array_shift() returns a value, and binding a
    // reference to it raises "Only variables should be assigned by reference"
    $str = array_shift($args);
    if (!is_string($str)) return false;

    $result = array();

    // trivial byte index, character offset pair
    $prev = array(0,0);

    // use a short piece of str to estimate bytes per character
    // $i (& $j) -> byte indexes into $str
    $i = utf8_locate_next_chr($str, 300);

    // $c -> character offset into $str
    $c = utf8_position_count_chars(substr($str,0,$i));

    // deal with arguments from lowest to highest
    sort($args);

    foreach ($args as $offset) {

        // 0 is an easy check
        if ($offset == 0) { $result[] = 0; continue; }

        // Already standing on the requested character. Answer directly:
        // running the estimator here would make a zero length move, store
        // ($i,$c) in $prev and make the next position look like it had
        // "gone past end of string" (it would wrongly get strlen($str)).
        if ($offset == $c) { $result[] = $i; continue; }

        // ensure no endless looping
        $safety_valve = 50;

        do {

            if ( ($c - $prev[1]) == 0 ) {
                // Hack: gone past end of string
                // (the previous move covered no characters, so there is
                // no slope left to extrapolate from)
                $error = 0;
                $i = strlen($str);
                break;
            }

            // linear extrapolation from the last two (byte, character)
            // pairs to a byte index for the wanted character offset
            $j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));

            // correct to utf8 character boundary
            $j = utf8_locate_next_chr($str, $j);

            // save the index, offset for use next iteration
            $prev = array($i,$c);

            if ($j > $i) {
                // moved forwards: add the characters stepped over
                $c += utf8_position_count_chars(substr($str,$i,$j-$i));
            } else {
                // moved backwards: subtract the characters stepped over
                $c -= utf8_position_count_chars(substr($str,$j,$i-$j));
            }

            $error = abs($c-$offset);

            // ready for next time around
            $i = $j;

        // from 7 it is faster to iterate over the string
        } while ( ($error > 7) && --$safety_valve) ;

        if ($error && $error <= 7) {

            if ($c < $offset) {
                // move up one character at a time
                while ($error--) { $i = utf8_locate_next_chr($str,++$i); }
            } else {
                // move down one character at a time
                while ($error--) { $i = utf8_locate_current_chr($str,--$i); }
            }

            // ready for next arg
            $c = $offset;
        }
        $result[] = $i;
    }

    if ( count($result) == 1 ) {
        return $result[0];
    }

    return $result;
}

//--------------------------------------------------------------------
/**
 * Given a string and any byte index, returns the byte index
 * of the start of the current UTF-8 character, relative to supplied
 * position. If the current character begins at the same place as the
 * supplied byte index, that byte index will be returned. Otherwise
 * this function will step backwards, looking for the index where
 * current UTF-8 character begins.
 * An index of zero or less returns 0; an index at or beyond the end of
 * the string returns the length of the string in bytes.
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string
 * @param int byte index in the string
 * @return int byte index of start of current UTF-8 character
 * @package utf8
 * @subpackage position
 */
function utf8_locate_current_chr( &$str, $idx ) {

    if ($idx <= 0) return 0;

    $limit = strlen($str);
    if ($idx >= $limit) return $limit;

    // Binary value for any byte after the first in a multi-byte UTF-8 character
    // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
    // of byte - assuming well formed UTF-8
    while ($idx && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx--;

    return $idx;
}

//--------------------------------------------------------------------
/**
 * Given a string and any byte index, returns the byte index
 * of the start of the next UTF-8 character, relative to supplied
 * position. If the next character begins at the same place as the
 * supplied byte index, that byte index will be returned.
 * An index of zero or less returns 0; an index at or beyond the end of
 * the string returns the length of the string in bytes.
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string
 * @param int byte index in the string
 * @return int byte index of start of next UTF-8 character
 * @package utf8
 * @subpackage position
 */
function utf8_locate_next_chr( &$str, $idx ) {

    if ($idx <= 0) return 0;

    $limit = strlen($str);
    if ($idx >= $limit) return $limit;

    // Binary value for any byte after the first in a multi-byte UTF-8 character
    // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
    // of byte - assuming well formed UTF-8
    while (($idx < $limit) && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx++;

    return $idx;
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Thu Jul 28 15:48:31 2011 | Cross-referenced by PHPXref 0.7 |