| [ Index ] |
PHP Cross Reference of MantisBT |
[Summary view] [Print] [Text view]
<?php
/**
 * @version $Id: validation.php,v 1.2 2006/02/26 13:20:44 harryf Exp $
 * Tools for validating that a UTF-8 string is well formed.
 * The Original Code is Mozilla Communicator client code.
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
 * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
 * @see http://hsivonen.iki.fi/php-utf8/
 * @package utf8
 * @subpackage validation
 */

//--------------------------------------------------------------------
/**
 * Tests a string as to whether it's valid UTF-8 and supported by the
 * Unicode standard.
 *
 * The string is scanned octet by octet with a small state machine. It is
 * rejected when it contains a stray continuation octet, an illegal lead
 * octet (including the obsolete 5 and 6 octet forms), a non-shortest-form
 * encoding, a surrogate code point (U+D800..U+DFFF), a code point above
 * U+10FFFF, or when it ends in the middle of a multi-octet sequence.
 *
 * Note: this function has been modified to simply return true or false
 * @author <hsivonen@iki.fi>
 * @param string UTF-8 encoded string
 * @return boolean true if valid
 * @see http://hsivonen.iki.fi/php-utf8/
 * @see utf8_compliant
 * @package utf8
 * @subpackage validation
 */
function utf8_is_valid($str) {

    $mState = 0; // continuation octets still expected before the current
                 // character sequence is complete
    $mUcs4  = 0; // code point accumulated from the octets seen so far
    $mBytes = 1; // total number of octets in the current sequence

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {

        // Square-bracket offset: the curly-brace form ($str{$i}) was
        // deprecated in PHP 7.4 and removed in PHP 8.0.
        $in = ord($str[$i]);

        if ($mState === 0) {

            // Between characters: expect either a US-ASCII octet or the
            // lead octet of a multi-octet sequence.
            if (($in & 0x80) === 0) {
                // US-ASCII, pass straight through.
                $mBytes = 1;

            } else if (($in & 0xE0) === 0xC0) {
                // First octet of 2 octet sequence
                $mUcs4  = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (($in & 0xF0) === 0xE0) {
                // First octet of 3 octet sequence
                $mUcs4  = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (($in & 0xF8) === 0xF0) {
                // First octet of 4 octet sequence
                $mUcs4  = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else {
                /* Either a stray continuation octet (0x80-0xBF), the lead
                 * octet of a 5 or 6 octet sequence (0xF8-0xFD), or 0xFE/0xFF.
                 * 5 and 6 octet sequences can only encode a non-shortest
                 * form or a code point outside 0-0x10FFFF, so they are
                 * rejected immediately instead of being decoded first.
                 */
                return false;
            }

        } else {

            // Inside a multi-octet sequence: only a continuation octet
            // (10xxxxxx) is acceptable.
            if (($in & 0xC0) !== 0x80) {
                // Incomplete multi-octet sequence.
                return false;
            }

            // Merge the six payload bits into their position in the code point.
            $mUcs4 |= ($in & 0x3F) << (($mState - 1) * 6);

            if (--$mState === 0) {

                // End of the multi-octet sequence: $mUcs4 now holds the
                // complete code point. Check for illegal sequences.
                if (
                    // From Unicode 3.1, non-shortest form is illegal
                    (2 === $mBytes && $mUcs4 < 0x0080) ||
                    (3 === $mBytes && $mUcs4 < 0x0800) ||
                    (4 === $mBytes && $mUcs4 < 0x10000) ||
                    // From Unicode 3.2, surrogate characters are illegal.
                    // Written as a range test so that no constant exceeds
                    // the integer range of 32-bit builds.
                    ($mUcs4 >= 0xD800 && $mUcs4 <= 0xDFFF) ||
                    // Codepoints outside the Unicode range are illegal
                    ($mUcs4 > 0x10FFFF)
                ) {
                    return false;
                }

                // Reset the cache for the next character.
                $mUcs4  = 0;
                $mBytes = 1;
            }
        }
    }

    // A non-zero state here means the string ended in the middle of a
    // multi-octet sequence, which is not well-formed UTF-8.
    return $mState === 0;
}

//--------------------------------------------------------------------
/**
 * Tests whether a string complies as UTF-8. This will be much
 * faster than utf8_is_valid but will pass five and six octet
 * UTF-8 sequences, which are not supported by Unicode and
 * so cannot be displayed correctly in a browser. In other words
 * it is not as strict as utf8_is_valid but it's faster. If you use
 * it to validate user input, you place yourself at the risk that
 * attackers will be able to inject 5 and 6 byte sequences (which
 * may or may not be a significant risk, depending on what you
 * are doing).
 *
 * NOTE(review): how strict the check is depends on the PCRE library PHP
 * was built against; newer builds may reject 5 and 6 octet sequences
 * as well - confirm before relying on either behaviour.
 *
 * @see utf8_is_valid
 * @see http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
 * @param string UTF-8 string to check
 * @return boolean TRUE if string is valid UTF-8
 * @package utf8
 * @subpackage validation
 */
function utf8_compliant($str) {
    if (strlen($str) == 0) {
        return true;
    }
    // If even just the first character can be matched, when the /u
    // modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
    // invalid, nothing at all will match, even if the string contains
    // some valid sequences (preg_match() then yields FALSE rather than 1).
    return preg_match('/^.{1}/us', $str) === 1;
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Thu Jul 28 15:48:31 2011 | Cross-referenced by PHPXref 0.7 |