| [ Index ] |
PHP Cross Reference of MantisBT |
[Summary view] [Print] [Text view]
<?php
/**
* @version $Id: unicode.php,v 1.2 2006/02/26 13:20:44 harryf Exp $
* Tools for conversion between UTF-8 and unicode
* The Original Code is Mozilla Communicator client code.
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
* Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
* Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage unicode
*/

//--------------------------------------------------------------------
/**
* Takes a UTF-8 string and returns an array of ints representing the
* Unicode code points. Astral planes are supported, i.e. the ints in the
* output can be > 0xFFFF. Occurrences of the BOM (U+FEFF) are accepted but
* dropped from the output. Surrogates are not allowed.
*
* Returns FALSE and raises a PHP error at level E_USER_WARNING if the input
* is not a valid UTF-8 octet sequence. The following are rejected:
*  - an octet that cannot start a sequence (stray continuation, 0xFE, 0xFF)
*  - a lead octet that is not followed by enough continuation octets,
*    including a sequence cut off by the end of the string
*  - non-shortest-form (overlong) encodings
*  - 5 and 6 octet sequences
*  - surrogates (U+D800..U+DFFF) and code points above U+10FFFF
*
* Note: this function has been modified slightly in this library to
* trigger errors on encountering bad bytes
* @author <hsivonen@iki.fi>
* @param string UTF-8 encoded string
* @return mixed array of unicode code points or FALSE if UTF-8 invalid
* @see utf8_from_unicode
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage unicode
*/
function utf8_to_unicode($str) {

    $mState = 0;     // number of continuation octets still expected before
                     // the current multi-octet sequence is complete
    $mUcs4  = 0;     // code point assembled so far
    $mBytes = 1;     // total number of octets in the current sequence

    $out = array();

    $len = strlen($str);

    for($i = 0; $i < $len; $i++) {

        // Square-bracket offset: the curly-brace form ($str{$i}) is a
        // parse error from PHP 8.0 onwards.
        $in = ord($str[$i]);

        if ( $mState == 0) {

            // When mState is zero we expect either a US-ASCII character or
            // the lead octet of a multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence (110xxxxx)
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence (1110xxxx)
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence (11110xxx)
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we will carry on until the end
                 * of the sequence and let the later error handling code catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;

            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;

            } else {
                /* Current octet is neither in the US-ASCII range nor a legal first
                 * octet of a multi-octet sequence.
                 */
                trigger_error(
                        'utf8_to_unicode: Illegal sequence identifier '.
                            'in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );
                return FALSE;

            }

        } else {

            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence (10xxxxxx)
            if (0x80 == (0xC0 & ($in))) {

                // Legal continuation: merge its 6 payload bits into place.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                /**
                * End of the multi-octet sequence. mUcs4 now contains the final
                * Unicode codepoint to be output
                */
                if (0 == --$mState) {

                    /*
                    * Check for illegal sequences and codepoints.
                    */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {

                        trigger_error(
                                'utf8_to_unicode: Illegal sequence or codepoint '.
                                    'in UTF-8 at byte '.$i,
                                E_USER_WARNING
                            );

                        return FALSE;

                    }

                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }

                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                }

            } else {
                /**
                *((0xC0 & (*in) != 0x80) && (mState != 0))
                * Incomplete multi-octet sequence.
                */
                trigger_error(
                        'utf8_to_unicode: Incomplete multi-octet '.
                        '   sequence in UTF-8 at byte '.$i,
                        E_USER_WARNING
                    );

                return FALSE;
            }
        }
    }

    // The string ended while continuation octets were still expected:
    // the final sequence is truncated, so the input is not valid UTF-8.
    if ($mState != 0) {
        trigger_error(
                'utf8_to_unicode: Incomplete multi-octet '.
                '   sequence in UTF-8 at byte '.$len,
                E_USER_WARNING
            );

        return FALSE;
    }

    return $out;
}

//--------------------------------------------------------------------
/**
* Takes an array of ints representing the Unicode code points and returns
* a UTF-8 string. Astral planes are supported, i.e. the ints in the
* input can be > 0xFFFF. Occurrences of the BOM (U+FEFF) are skipped.
* Surrogates are not allowed.
*
* Returns FALSE and raises a PHP error at level E_USER_WARNING if the
* input array contains ints that represent surrogates or are outside the
* Unicode range (negative or greater than 0x10FFFF).
*
* Note: the array is referenced by its keys, so the index reported in an
* error message is the caller's own key. The result is built by string
* concatenation rather than output buffering so that an early error
* return cannot leave an output buffer open or leak partial output.
* @param array of unicode code points representing a string
* @return mixed UTF-8 string or FALSE if array contains invalid code points
* @author <hsivonen@iki.fi>
* @see utf8_to_unicode
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage unicode
*/
function utf8_from_unicode($arr) {

    $result = '';

    foreach ($arr as $k => $cp) {

        # Negative values are not code points; without this guard they
        # would fall into the 2 byte branch and emit garbage octets
        if ($cp < 0) {

            trigger_error(
                'utf8_from_unicode: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
                );

            return FALSE;

        # ASCII range (including control chars)
        } else if ($cp <= 0x007f) {

            $result .= chr($cp);

        # 2 byte sequence
        } else if ($cp <= 0x07ff) {

            $result .= chr(0xc0 | ($cp >> 6));
            $result .= chr(0x80 | ($cp & 0x003f));

        # Byte order mark (skip)
        } else if ($cp == 0xFEFF) {

            // nop -- zap the BOM

        # Test for illegal surrogates
        } else if ($cp >= 0xD800 && $cp <= 0xDFFF) {

            // found a surrogate
            trigger_error(
                'utf8_from_unicode: Illegal surrogate '.
                    'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
                );

            return FALSE;

        # 3 byte sequence
        } else if ($cp <= 0xffff) {

            $result .= chr(0xe0 | ($cp >> 12));
            $result .= chr(0x80 | (($cp >> 6) & 0x003f));
            $result .= chr(0x80 | ($cp & 0x003f));

        # 4 byte sequence
        } else if ($cp <= 0x10ffff) {

            $result .= chr(0xf0 | ($cp >> 18));
            $result .= chr(0x80 | (($cp >> 12) & 0x3f));
            $result .= chr(0x80 | (($cp >> 6) & 0x3f));
            $result .= chr(0x80 | ($cp & 0x3f));

        } else {

            trigger_error(
                'utf8_from_unicode: Codepoint out of Unicode range '.
                    'at index: '.$k.', value: '.$cp,
                E_USER_WARNING
                );

            // out of range
            return FALSE;
        }
    }

    return $result;
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Thu Jul 28 15:48:31 2011 | Cross-referenced by PHPXref 0.7 |