Verzeichnisstruktur phpBB-3.0.0
- Veröffentlicht
- 12.12.2007
So funktioniert es
|
Auf das letzte Element klicken. Dies geht jeweils einen Schritt zurück. |
Auf das Icon klicken, dies öffnet das Verzeichnis. Nochmal klicken schließt das Verzeichnis. |
|
(Beispiel Datei-Icons)
|
Auf das Icon klicken, um den Quellcode anzuzeigen. |
utf_normalizer.php
0001 <?php
0002 /**
0003 *
0004 * @package utf
0005 * @version $Id$
0006 * @copyright (c) 2005 phpBB Group
0007 * @license http://opensource.org/licenses/gpl-license.php GNU Public License
0008 *
0009 */
0010
0011 /**
0012 */
// Abort unless the including script has defined the IN_PHPBB constant
0013 if (!defined('IN_PHPBB'))
0014 {
0015 exit;
0016 }
0017
0018 /**
0019 * Some Unicode characters and range boundaries, encoded in UTF-8
0020 *
0021 * Preserved for compatibility
0022 */
0023 define('UTF8_REPLACEMENT', "\xEF\xBF\xBD"); // U+FFFD
0024 define('UTF8_MAX', "\xF4\x8F\xBF\xBF"); // U+10FFFF
0025 define('UTF8_FFFE', "\xEF\xBF\xBE"); // U+FFFE
0026 define('UTF8_FFFF', "\xEF\xBF\xBF"); // U+FFFF
0027 define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80"); // U+D800
0028 define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF"); // U+DFFF
0029 define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80"); // U+AC00
0030 define('UTF8_HANGUL_LAST', "\xED\x9E\xA3"); // U+D7A3
0031
0032 define('UTF8_CJK_FIRST', "\xE4\xB8\x80"); // U+4E00
0033 define('UTF8_CJK_LAST', "\xE9\xBE\xBB"); // U+9FBB
0034 define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80"); // U+20000
0035 define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96"); // U+2A6D6
0036
0037 // Unset the global variables that are later tested with isset() to decide whether a lookup table has to be included
// NOTE(review): presumably this keeps values injected from outside (e.g. register_globals) from being mistaken for loaded tables -- confirm
0038 unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
0039
0040 // NFC_QC and NFKC_QC values, compared against the entries of the quick-check arrays
0041 define('UNICODE_QC_MAYBE', 0);
0042 define('UNICODE_QC_NO', 1);
0043
0044 // Contains all the ASCII characters appearing in UTF-8, sorted by frequency. Used as the strspn() mask that skips runs of ASCII
0045 define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");
0046
0047 // Contains all the tail bytes that can appear in the composition of a UTF-8 char. Used as the strspn() mask that skips trailing bytes
0048 define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");
0049
0050 // Constants used by the Hangul [de]composition algorithms
// Note: NCOUNT equals VCOUNT * TCOUNT (21 * 28) and SCOUNT equals LCOUNT * NCOUNT (19 * 588)
0051 define('UNICODE_HANGUL_SBASE', 0xAC00);
0052 define('UNICODE_HANGUL_LBASE', 0x1100);
0053 define('UNICODE_HANGUL_VBASE', 0x1161);
0054 define('UNICODE_HANGUL_TBASE', 0x11A7);
0055 define('UNICODE_HANGUL_SCOUNT', 11172);
0056 define('UNICODE_HANGUL_LCOUNT', 19);
0057 define('UNICODE_HANGUL_VCOUNT', 21);
0058 define('UNICODE_HANGUL_TCOUNT', 28);
0059 define('UNICODE_HANGUL_NCOUNT', 588);
// Jamo types, compared against the entries of $utf_jamo_type when composing Hangul syllables
0060 define('UNICODE_JAMO_L', 0);
0061 define('UNICODE_JAMO_V', 1);
0062 define('UNICODE_JAMO_T', 2);
0063
0064 /**
0065 * Unicode normalization routines
0066 *
0067 * @package utf
0068 */
0069 class utf_normalizer
0070 {
/**
* Validate, clean up and normalize a string in place
*
* Invalid UTF-8 sequences and unwanted control characters are replaced, then the
* string is converted to Normal Form C (canonical composition). The result is
* written back through the reference; the method itself does not return a value.
*
* @param	string	&$str	The dirty string, cleaned up in place
* @return	void
*/
function cleanup(&$str)
{
	// List of all authorized single-byte characters, sorted by frequency in latin text.
	// Control characters other than \t, \n and \r are not part of it.
	$authorized = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D";

	$first_special = strspn($str, $authorized);
	$byte_count = strlen($str);

	// Strings made of authorized ASCII characters only are left untouched
	if ($first_special != $byte_count)
	{
		// Load the NFC quick-check table on first use
		if (!isset($GLOBALS['utf_nfc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
		}

		// Load the canonical decomposition table on first use
		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
		}

		// Every byte in the range 0x00..0x1F, except for \r, \n and \t, is turned into a 0xFF byte.
		// 0xFF is illegal in UTF-8, so it will in turn be replaced with a UTF replacement char.
		$control_bytes = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
		$illegal_bytes = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF";
		$str = strtr($str, $control_bytes, $illegal_bytes);

		$str = utf_normalizer::recompose($str, $first_special, $byte_count, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
	}
}
0115
/**
* Validate a UTF string and normalize it to NFC, in place
*
* The normalized string is written back through the reference; the method
* itself does not return a value. Pure ASCII input is left untouched.
*
* @param	string	&$str	Unchecked UTF string, replaced with its validated NFC form
* @return	void
*/
function nfc(&$str)
{
	$ascii_prefix = strspn($str, UTF8_ASCII_RANGE);
	$byte_count = strlen($str);

	// Only strings holding at least one non-ASCII byte need any work
	if ($ascii_prefix != $byte_count)
	{
		// Load the NFC quick-check table on first use
		if (!isset($GLOBALS['utf_nfc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
		}

		// Load the canonical decomposition table on first use
		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
		}

		$str = utf_normalizer::recompose($str, $ascii_prefix, $byte_count, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
	}
}
0147
/**
* Validate a UTF string and normalize it to NFKC, in place
*
* The normalized string is written back through the reference; the method
* itself does not return a value. Pure ASCII input is left untouched.
*
* @param	string	&$str	Unchecked UTF string, replaced with its validated NFKC form
* @return	void
*/
function nfkc(&$str)
{
	$ascii_prefix = strspn($str, UTF8_ASCII_RANGE);
	$byte_count = strlen($str);

	// Only strings holding at least one non-ASCII byte need any work
	if ($ascii_prefix != $byte_count)
	{
		// Load the NFKC quick-check table on first use
		if (!isset($GLOBALS['utf_nfkc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
		}

		// Load the compatibility decomposition table on first use
		if (!isset($GLOBALS['utf_compatibility_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
		}

		$str = utf_normalizer::recompose($str, $ascii_prefix, $byte_count, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
	}
}
0179
/**
* Validate a UTF string and normalize it to NFD, in place
*
* The normalized string is written back through the reference; the method
* itself does not return a value. Pure ASCII input is left untouched.
*
* @param	string	&$str	Unchecked UTF string, replaced with its validated NFD form
* @return	void
*/
function nfd(&$str)
{
	$ascii_prefix = strspn($str, UTF8_ASCII_RANGE);
	$byte_count = strlen($str);

	// Only strings holding at least one non-ASCII byte need any work
	if ($ascii_prefix != $byte_count)
	{
		// Load the canonical decomposition table on first use
		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
		}

		$str = utf_normalizer::decompose($str, $ascii_prefix, $byte_count, $GLOBALS['utf_canonical_decomp']);
	}
}
0205
/**
* Validate a UTF string and normalize it to NFKD, in place
*
* The normalized string is written back through the reference; the method
* itself does not return a value. Pure ASCII input is left untouched.
*
* @param	string	&$str	Unchecked UTF string, replaced with its validated NFKD form
* @return	void
*/
function nfkd(&$str)
{
	$ascii_prefix = strspn($str, UTF8_ASCII_RANGE);
	$byte_count = strlen($str);

	// Only strings holding at least one non-ASCII byte need any work
	if ($ascii_prefix != $byte_count)
	{
		// Load the compatibility decomposition table on first use
		if (!isset($GLOBALS['utf_compatibility_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
		}

		$str = utf_normalizer::decompose($str, $ascii_prefix, $byte_count, $GLOBALS['utf_compatibility_decomp']);
	}
}
0231
0232
0233 /**
0234 * Validate and recompose a UTF string
0235 *
* Walks the string from $pos on. Ill-formed or disallowed byte sequences are replaced
* with UTF8_REPLACEMENT. Characters found in the quick-check array, and combining
* characters that are not canonically ordered, are decomposed together with their
* neighbours, sorted by combining class and composed again.
* Called by cleanup(), nfc() and nfkc().
*
0236 * @param string $str Unchecked UTF string
0237 * @param integer $pos Position of the first UTF char (in bytes); callers pass the length of the leading ASCII run
0238 * @param integer $len Length of the string (in bytes)
0239 * @param array &$qc Quick-check array, passed by reference but never modified
0240 * @param array &$decomp_map Decomposition mapping, passed by reference but never modified
0241 * @return string The string, validated and recomposed
0242 *
0243 * @access private
0244 */
0245 function recompose($str, $pos, $len, &$qc, &$decomp_map)
0246 {
0247 global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;
0248
0249 // Load some commonly-used tables
0250 if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
0251 {
0252 global $phpbb_root_path, $phpEx;
0253 include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
0254 }
0255
0256 // Load the canonical composition table
0257 if (!isset($utf_canonical_comp))
0258 {
0259 global $phpbb_root_path, $phpEx;
0260 include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
0261 }
0262
// $tmp accumulates the corrected output and $tmp_pos is the offset in $str up to which $tmp is complete.
// $i counts the buffered characters and $last_cc holds the combining class of the previous character.
0263 // Buffer the last ASCII char before the UTF-8 stuff if applicable
0264 $tmp = '';
0265 $i = $tmp_pos = $last_cc = 0;
0266
0267 $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
0268
0269 // UTF char length array
0270 // This array is used to determine the length of a UTF character.
0271 // Let $c be the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
0272 // the position of the cursor--. If $utf_len_mask[$c] does not exist, the byte is an ASCII char.
0273 // Otherwise, if $utf_len_mask[$c] is greater than 0, we have the leading byte of a multibyte character
0274 // whose length is $utf_len_mask[$c], and if it is equal to 0, the byte is a trailing byte.
0275 $utf_len_mask = array(
0276 // Leading bytes masks
0277 "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
0278 // Trailing bytes masks
0279 "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
0280 );
0281
// Leading bytes that trigger an extra check (surrogates, U+FFFE/U+FFFF, overlong forms, out-of-range values) in the switch below
0282 $extra_check = array(
0283 "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
0284 "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
0285 "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
0286 );
0287
// A sequence of length n is well-formed when ($utf_char & $utf_validation_mask[n]) equals $utf_validation_check[n],
// i.e. a proper leading byte followed by n - 1 bytes of the form 10?? ????
0288 $utf_validation_mask = array(
0289 2 => "\xE0\xC0",
0290 3 => "\xF0\xC0\xC0",
0291 4 => "\xF8\xC0\xC0\xC0"
0292 );
0293
0294 $utf_validation_check = array(
0295 2 => "\xC0\x80",
0296 3 => "\xE0\x80\x80",
0297 4 => "\xF0\x80\x80\x80"
0298 );
0299
0300 // Main loop
0301 do
0302 {
0303 // STEP 0: Capture the current char and buffer it
0304 $c = $str[$pos];
0305 $c_mask = $c & "\xF0";
0306
0307 if (isset($utf_len_mask[$c_mask]))
0308 {
0309 // Byte at $pos is either a leading byte or a misplaced trailing byte
0310 if ($utf_len = $utf_len_mask[$c_mask])
0311 {
0312 // Capture the char and store it in the 8-slot ring buffer
0313 $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);
0314
0315 // Let's find out if a thorough check is needed
0316 if (isset($qc[$utf_char]))
0317 {
0318 // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
0319 }
0320 else if (isset($utf_combining_class[$utf_char]))
0321 {
0322 if ($utf_combining_class[$utf_char] < $last_cc)
0323 {
0324 // A combining character that is NOT canonically ordered, it is processed below this "if" block as well
0325 }
0326 else
0327 {
0328 // A combining character that IS canonically ordered, skip to the next char
0329 $last_cc = $utf_combining_class[$utf_char];
0330
0331 $pos += $utf_len;
0332 continue;
0333 }
0334 }
0335 else
0336 {
0337 // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
0338 // It can be a singleton, a canonical composite, a replacement char or even an ill-formed bunch of bytes. Let's find out
0339 $last_cc = 0;
0340
0341 // Check that we have the correct number of trailing bytes
0342 if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
0343 {
0344 // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
0345 // has been encoded in a five- or six- byte sequence
0346 if ($utf_char[0] >= "\xF8")
0347 {
0348 if ($utf_char[0] < "\xFC")
0349 {
0350 $trailing_bytes = 4;
0351 }
0352 else if ($utf_char[0] > "\xFD")
0353 {
0354 $trailing_bytes = 0;
0355 }
0356 else
0357 {
0358 $trailing_bytes = 5;
0359 }
0360 }
0361 else
0362 {
0363 $trailing_bytes = $utf_len - 1;
0364 }
0365
// Flush the valid chunk plus one replacement char, then skip the leading byte (++$pos) and at most $trailing_bytes trailing bytes
// NOTE(review): relies on ++$pos taking effect before the += reads $pos -- confirm
0366 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0367 $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
0368 $tmp_pos = $pos;
0369
0370 continue;
0371 }
0372
0373 if (isset($extra_check[$c]))
0374 {
0375 switch ($c)
0376 {
0377 // Note: 0xED is quite common in Korean
0378 case "\xED":
0379 if ($utf_char >= "\xED\xA0\x80")
0380 {
0381 // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
0382 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0383 $pos += $utf_len;
0384 $tmp_pos = $pos;
0385 continue 2;
0386 }
0387 break;
0388
0389 // Note: 0xEF is quite common in Japanese
0390 case "\xEF":
0391 if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
0392 {
0393 // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
0394 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0395 $pos += $utf_len;
0396 $tmp_pos = $pos;
0397 continue 2;
0398 }
0399 break;
0400
0401 case "\xC0":
0402 case "\xC1":
0403 if ($utf_char <= "\xC1\xBF")
0404 {
0405 // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
0406 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0407 $pos += $utf_len;
0408 $tmp_pos = $pos;
0409 continue 2;
0410 }
0411 break;
0412
0413 case "\xE0":
0414 if ($utf_char <= "\xE0\x9F\xBF")
0415 {
0416 // Overlong sequence: Unicode char U+0000..U+07FF encoded in 3 bytes
0417 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0418 $pos += $utf_len;
0419 $tmp_pos = $pos;
0420 continue 2;
0421 }
0422 break;
0423
0424 case "\xF0":
0425 if ($utf_char <= "\xF0\x8F\xBF\xBF")
0426 {
0427 // Overlong sequence: Unicode char U+0000..U+FFFF encoded in 4 bytes
0428 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0429 $pos += $utf_len;
0430 $tmp_pos = $pos;
0431 continue 2;
0432 }
0433 break;
0434
0435 default:
0436 // Five- and six- byte sequences do not need being checked for here anymore
0437 if ($utf_char > UTF8_MAX)
0438 {
0439 // Out of the Unicode range
0440 if ($utf_char[0] < "\xF8")
0441 {
0442 $trailing_bytes = 3;
0443 }
0444 else if ($utf_char[0] < "\xFC")
0445 {
0446 $trailing_bytes = 4;
0447 }
0448 else if ($utf_char[0] > "\xFD")
0449 {
0450 $trailing_bytes = 0;
0451 }
0452 else
0453 {
0454 $trailing_bytes = 5;
0455 }
0456
0457 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0458 $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
0459 $tmp_pos = $pos;
0460 continue 2;
0461 }
0462 break;
0463 }
0464 }
0465
0466 // The char is a valid starter, move the cursor and go on
0467 $pos += $utf_len;
0468 continue;
0469 }
0470 }
0471 else
0472 {
0473 // A trailing byte came out of nowhere, we will advance the cursor and treat this byte and all following trailing bytes as if
0474 // each of them was a Unicode replacement char
0475 $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
0476 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
0477
0478 $pos += $spn;
0479 $tmp_pos = $pos;
0480 continue;
0481 }
0482
0483
0484 // STEP 1: Decompose current char
0485
0486 // We have found a character that is either:
0487 // - in the NFC_QC/NFKC_QC list
0488 // - a non-starter char that is not canonically ordered
0489 //
0490 // We are going to capture the shortest UTF sequence that satisfies these two conditions:
0491 //
0492 // 1 - If the sequence does not start at the beginning of the string, it must begin with a starter,
0493 // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
0494 //
0495 // 2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
0496 // immediately followed by a starter that is not on the QC list
0497 //
// $lpos marks where the captured sequence starts in $str, $pos is moved past the current char
0498 $utf_seq = array();
0499 $last_cc = 0;
0500 $lpos = $pos;
0501 $pos += $utf_len;
0502
0503 if (isset($decomp_map[$utf_char]))
0504 {
0505 $_pos = 0;
0506 $_len = strlen($decomp_map[$utf_char]);
0507
// Split the decomposition into its individual characters
0508 do
0509 {
// NOTE(review): binding by reference creates a null entry in $utf_len_mask for unknown masks; isset() still reports it as unset
0510 $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
0511
0512 if (isset($_utf_len))
0513 {
0514 $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0515 $_pos += $_utf_len;
0516 }
0517 else
0518 {
0519 $utf_seq[] = $decomp_map[$utf_char][$_pos];
0520 ++$_pos;
0521 }
0522 }
0523 while ($_pos < $_len);
0524 }
0525 else
0526 {
0527 // The char is not decomposable
0528 $utf_seq = array($utf_char);
0529 }
0530
0531
0532 // STEP 2: Capture the starter
0533
0534 // Check out the combining class of the first character of the UTF sequence
// NOTE(review): $qc[$utf_char] can be unset when we got here through the "not canonically ordered" branch; the loose
// comparison then treats the missing entry like UNICODE_QC_MAYBE (0) -- confirm this is intended
0535 $k = 0;
0536 if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
0537 {
0538 // Not a starter, inspect previous characters
0539 // The last 8 characters are kept in a buffer so that we don't have to capture them every time.
0540 // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
0541 // although it is slower than this method.
0542 //
0543 // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
0544 // at offset $i) and process them in backward mode until we find a starter.
0545 //
0546 // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
0547 // characters numbered 0 to n. $k starts at 0 and is pre-decremented for each char we prepend, so prepended chars get negative indexes
0548 $starter_found = 0;
0549 $j_min = max(1, $i - 7);
0550
0551 for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
0552 {
0553 $utf_char = $buffer[$j & 7];
0554 $lpos -= strlen($utf_char);
0555
0556 if (isset($decomp_map[$utf_char]))
0557 {
0558 // The char is a composite, decompose for storage
0559 $decomp_seq = array();
0560 $_pos = 0;
0561 $_len = strlen($decomp_map[$utf_char]);
0562
0563 do
0564 {
0565 $c = $decomp_map[$utf_char][$_pos];
0566 $_utf_len =& $utf_len_mask[$c & "\xF0"];
0567
0568 if (isset($_utf_len))
0569 {
0570 $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0571 $_pos += $_utf_len;
0572 }
0573 else
0574 {
0575 $decomp_seq[] = $c;
0576 ++$_pos;
0577 }
0578 }
0579 while ($_pos < $_len);
0580
0581 // Prepend the UTF sequence with our decomposed sequence
0582 if (isset($decomp_seq[1]))
0583 {
0584 // The char expanded into several chars
0585 $decomp_cnt = sizeof($decomp_seq);
0586
0587 foreach ($decomp_seq as $decomp_i => $decomp_char)
0588 {
0589 $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
0590 }
0591 $k -= $decomp_cnt;
0592 }
0593 else
0594 {
0595 // Decomposed to a single char, easier to prepend
0596 $utf_seq[--$k] = $decomp_seq[0];
0597 }
0598 }
0599 else
0600 {
0601 $utf_seq[--$k] = $utf_char;
0602 }
0603
0604 if (!isset($utf_combining_class[$utf_seq[$k]]))
0605 {
0606 // We have found our starter
0607 $starter_found = 1;
0608 break;
0609 }
0610 }
0611
0612 if (!$starter_found && $lpos > $tmp_pos)
0613 {
0614 // The starter was not found in the buffer, let's rewind some more
0615 do
0616 {
0617 // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.
0618 $c = $str[--$lpos];
0619 $c_mask = $c & "\xF0";
0620
0621 if (isset($utf_len_mask[$c_mask]))
0622 {
0623 // UTF byte
0624 if ($utf_len = $utf_len_mask[$c_mask])
0625 {
0626 // UTF *leading* byte
0627 $utf_char = substr($str, $lpos, $utf_len);
0628
0629 if (isset($decomp_map[$utf_char]))
0630 {
0631 // Decompose the character
0632 $decomp_seq = array();
0633 $_pos = 0;
0634 $_len = strlen($decomp_map[$utf_char]);
0635
0636 do
0637 {
0638 $c = $decomp_map[$utf_char][$_pos];
0639 $_utf_len =& $utf_len_mask[$c & "\xF0"];
0640
0641 if (isset($_utf_len))
0642 {
0643 $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0644 $_pos += $_utf_len;
0645 }
0646 else
0647 {
0648 $decomp_seq[] = $c;
0649 ++$_pos;
0650 }
0651 }
0652 while ($_pos < $_len);
0653
0654 // Prepend the UTF sequence with our decomposed sequence
0655 if (isset($decomp_seq[1]))
0656 {
0657 // The char expanded into several chars
0658 $decomp_cnt = sizeof($decomp_seq);
0659 foreach ($decomp_seq as $decomp_i => $utf_char)
0660 {
0661 $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
0662 }
0663 $k -= $decomp_cnt;
0664 }
0665 else
0666 {
0667 // Decomposed to a single char, easier to prepend
0668 $utf_seq[--$k] = $decomp_seq[0];
0669 }
0670 }
0671 else
0672 {
0673 $utf_seq[--$k] = $utf_char;
0674 }
0675 }
0676 }
0677 else
0678 {
0679 // ASCII char
0680 $utf_seq[--$k] = $c;
0681 }
0682 }
0683 while ($lpos > $tmp_pos);
0684 }
0685 }
0686
0687
0688 // STEP 3: Capture following combining modifiers
0689
0690 while ($pos < $len)
0691 {
0692 $c_mask = $str[$pos] & "\xF0";
0693
0694 if (isset($utf_len_mask[$c_mask]))
0695 {
0696 if ($utf_len = $utf_len_mask[$c_mask])
0697 {
0698 $utf_char = substr($str, $pos, $utf_len);
0699 }
0700 else
0701 {
0702 // A trailing byte came out of nowhere
0703 // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
0704 // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
0705 break;
0706 }
0707
0708 if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
0709 {
0710 // Combining character, add it to the sequence and move the cursor
0711 if (isset($decomp_map[$utf_char]))
0712 {
0713 // Decompose the character
0714 $_pos = 0;
0715 $_len = strlen($decomp_map[$utf_char]);
0716
0717 do
0718 {
0719 $c = $decomp_map[$utf_char][$_pos];
0720 $_utf_len =& $utf_len_mask[$c & "\xF0"];
0721
0722 if (isset($_utf_len))
0723 {
0724 $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0725 $_pos += $_utf_len;
0726 }
0727 else
0728 {
0729 $utf_seq[] = $c;
0730 ++$_pos;
0731 }
0732 }
0733 while ($_pos < $_len);
0734 }
0735 else
0736 {
0737 $utf_seq[] = $utf_char;
0738 }
0739
0740 $pos += $utf_len;
0741 }
0742 else
0743 {
0744 // Combining class 0 and no QC, break out of the loop
0745 // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
0746 break;
0747 }
0748 }
0749 else
0750 {
0751 // ASCII chars are starters
0752 break;
0753 }
0754 }
0755
0756
0757 // STEP 4: Sort and combine
0758
0759 // Here we sort...
// $utf_seq is indexed from $k (zero or negative) up to $k_max - 1
0760 $k_max = $k + sizeof($utf_seq);
0761
0762 if (!$k && $k_max == 1)
0763 {
0764 // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
0765 // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
0766 // if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
0767 // {
0768 $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
0769 $tmp_pos = $pos;
0770 // }
0771
0772 continue;
0773 }
0774
0775 // ...there we combine
// $starter is the starter currently being composed, $nf_seq collects the finished part of the sequence
0776 if (isset($utf_combining_class[$utf_seq[$k]]))
0777 {
0778 $starter = $nf_seq = '';
0779 }
0780 else
0781 {
0782 $starter = $utf_seq[$k++];
0783 $nf_seq = '';
0784 }
// Pending combining characters, grouped by combining class
0785 $utf_sort = array();
0786
0787 // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
0788 // at the end of the string without altering it
0789 $utf_seq[] = '';
0790
0791 do
0792 {
0793 $utf_char = $utf_seq[$k++];
0794
0795 if (isset($utf_combining_class[$utf_char]))
0796 {
0797 $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
0798 }
0799 else
0800 {
0801 if (empty($utf_sort))
0802 {
0803 // No combining characters... check for a composite of the two starters
0804 if (isset($utf_canonical_comp[$starter . $utf_char]))
0805 {
0806 // Good ol' composite character
0807 $starter = $utf_canonical_comp[$starter . $utf_char];
0808 }
0809 else if (isset($utf_jamo_type[$utf_char]))
0810 {
0811 // Current char is a composable jamo
0812 if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
0813 {
0814 // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
0815 if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
0816 {
0817 // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
0818 $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
0819 ++$k;
0820 }
0821 else
0822 {
0823 // L+V jamos, combine to a LV Hangul syllable
0824 $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
0825 }
0826
// Encode $cp as a three-byte UTF-8 sequence
// NOTE(review): presumably the $utf_jamo_index entries are pre-offset so that their sum is the syllable's code point -- verify against utf_normalizer_common
0827 $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
0828 }
0829 else
0830 {
0831 // Non-composable jamo, just add it to the sequence
0832 $nf_seq .= $starter;
0833 $starter = $utf_char;
0834 }
0835 }
0836 else
0837 {
0838 // No composite, just add the first starter to the sequence then continue with the other one
0839 $nf_seq .= $starter;
0840 $starter = $utf_char;
0841 }
0842 }
0843 else
0844 {
// Put the pending combining characters in canonical order (ascending combining class)
0845 ksort($utf_sort);
0846
0847 // For each class of combining characters
0848 foreach ($utf_sort as $cc => $utf_chars)
0849 {
0850 $j = 0;
0851
0852 do
0853 {
0854 // Look for a composite
0855 if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
0856 {
0857 // Found a composite, replace the starter
0858 $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
0859 unset($utf_sort[$cc][$j]);
0860 }
0861 else
0862 {
0863 // No composite, all following characters in that class are blocked
0864 break;
0865 }
0866 }
0867 while (isset($utf_sort[$cc][++$j]));
0868 }
0869
0870 // Add the starter to the normalized sequence, followed by non-starters in canonical order
0871 $nf_seq .= $starter;
0872
0873 foreach ($utf_sort as $utf_chars)
0874 {
0875 if (!empty($utf_chars))
0876 {
0877 $nf_seq .= implode('', $utf_chars);
0878 }
0879 }
0880
0881 // Reset the array and go on
0882 $utf_sort = array();
0883 $starter = $utf_char;
0884 }
0885 }
0886 }
0887 while ($k <= $k_max);
0888
// Flush the untouched chunk that precedes the sequence, followed by the normalized sequence
0889 $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
0890 $tmp_pos = $pos;
0891 }
0892 else
0893 {
0894 // Only an ASCII char can make the program get here
0895 //
0896 // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
0897 //
0898 // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
0899 // multi-byte text (where the only ASCII chars are spaces and punctuation)
0900 if (++$pos != $len)
0901 {
0902 if ($str[$pos] < "\x80")
0903 {
// Only the last ASCII char of the run is buffered
0904 $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
0905 $buffer[++$i & 7] = $str[$pos - 1];
0906 }
0907 else
0908 {
0909 $buffer[++$i & 7] = $c;
0910 }
0911 }
0912 }
0913 }
0914 while ($pos < $len);
0915
0916 // Now it is time to return the string
0917 if ($tmp_pos)
0918 {
0919 // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
0920 if ($tmp_pos == $len)
0921 {
0922 // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
0923 return $tmp;
0924 }
0925 else
0926 {
0927 // The rightmost chunk of $str has not been appended to $tmp yet
0928 return $tmp . substr($str, $tmp_pos);
0929 }
0930 }
0931
0932 // The string was already in normal form
0933 return $str;
0934 }
0935
0936 /**
0937 * Decompose a UTF string
0938 *
0939 * @param string $str UTF string
0940 * @param integer $pos Position of the first UTF char (in bytes)
0941 * @param integer $len Length of the string (in bytes)
0942 * @param array &$decomp_map Decomposition mapping, passed by reference but never modified
0943 * @return string The string, decomposed and sorted canonically
0944 *
0945 * @access private
0946 */
0947 function decompose($str, $pos, $len, &$decomp_map)
0948 {
0949 global $utf_combining_class;
0950
0951 // Load some commonly-used tables
0952 if (!isset($utf_combining_class))
0953 {
0954 global $phpbb_root_path, $phpEx;
0955 include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
0956 }
0957
0958 // UTF char length array
0959 $utf_len_mask = array(
0960 // Leading bytes masks
0961 "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
0962 // Trailing bytes masks
0963 "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
0964 );
0965
0966 // Some extra checks are triggered on the first byte of a UTF sequence
0967 $extra_check = array(
0968 "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
0969 "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
0970 "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
0971 );
0972
0973 // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
0974 // - 2-byte: 110? ???? 10?? ????
0975 // - 3-byte: 1110 ???? 10?? ???? 10?? ????
0976 // - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
0977 // Note that 5- and 6- byte sequences are automatically discarded
0978 $utf_validation_mask = array(
0979 2 => "\xE0\xC0",
0980 3 => "\xF0\xC0\xC0",
0981 4 => "\xF8\xC0\xC0\xC0"
0982 );
0983
0984 $utf_validation_check = array(
0985 2 => "\xC0\x80",
0986 3 => "\xE0\x80\x80",
0987 4 => "\xF0\x80\x80\x80"
0988 );
0989
0990 $tmp = '';
0991 $starter_pos = $pos;
0992 $tmp_pos = $last_cc = $sort = $dump = 0;
0993 $utf_sort = array();
0994
0995
0996 // Main loop
0997 do
0998 {
0999 // STEP 0: Capture the current char
1000
1001 $cur_mask = $str[$pos] & "\xF0";
1002 if (isset($utf_len_mask[$cur_mask]))
1003 {
1004 if ($utf_len = $utf_len_mask[$cur_mask])
1005 {
1006 // Multibyte char
1007 $utf_char = substr($str, $pos, $utf_len);
1008 $pos += $utf_len;
1009 }
1010 else
1011 {
1012 // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
1013 // replacement char and we will advance the cursor
1014 $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
1015
1016 if ($dump)
1017 {
1018 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1019
1020 // Dump combiners
1021 if (!empty($utf_sort))
1022 {
1023 if ($sort)
1024 {
1025 ksort($utf_sort);
1026 }
1027
1028 foreach ($utf_sort as $utf_chars)
1029 {
1030 $tmp .= implode('', $utf_chars);
1031 }
1032 }
1033
1034 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);
1035 $dump = $sort = 0;
1036 }
1037 else
1038 {
1039 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
1040 }
1041
1042 $pos += $spn;
1043 $tmp_pos = $starter_pos = $pos;
1044
1045 $utf_sort = array();
1046 $last_cc = 0;
1047
1048 continue;
1049 }
1050
1051
1052 // STEP 1: Decide what to do with current char
1053
1054 // Now, in that order:
1055 // - check if that character is decomposable
1056 // - check if that character is a non-starter
1057 // - check if that character requires extra checks to be performed
1058 if (isset($decomp_map[$utf_char]))
1059 {
1060 // Decompose the char
1061 $_pos = 0;
1062 $_len = strlen($decomp_map[$utf_char]);
1063
1064 do
1065 {
1066 $c = $decomp_map[$utf_char][$_pos];
1067 $_utf_len =& $utf_len_mask[$c & "\xF0"];
1068
1069 if (isset($_utf_len))
1070 {
1071 $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);
1072 $_pos += $_utf_len;
1073
1074 if (isset($utf_combining_class[$_utf_char]))
1075 {
1076 // The character decomposed to a non-starter, buffer it for sorting
1077 $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;
1078
1079 if ($utf_combining_class[$_utf_char] < $last_cc)
1080 {
1081 // Not canonically ordered, will require sorting
1082 $sort = $dump = 1;
1083 }
1084 else
1085 {
1086 $dump = 1;
1087 $last_cc = $utf_combining_class[$_utf_char];
1088 }
1089 }
1090 else
1091 {
1092 // This character decomposition contains a starter, dump the buffer and continue
1093 if ($dump)
1094 {
1095 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1096
1097 // Dump combiners
1098 if (!empty($utf_sort))
1099 {
1100 if ($sort)
1101 {
1102 ksort($utf_sort);
1103 }
1104
1105 foreach ($utf_sort as $utf_chars)
1106 {
1107 $tmp .= implode('', $utf_chars);
1108 }
1109 }
1110
1111 $tmp .= $_utf_char;
1112 $dump = $sort = 0;
1113 }
1114 else
1115 {
1116 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;
1117 }
1118
1119 $tmp_pos = $starter_pos = $pos;
1120 $utf_sort = array();
1121 $last_cc = 0;
1122 }
1123 }
1124 else
1125 {
1126 // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
1127 ++$_pos;
1128
1129 if ($dump)
1130 {
1131 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1132
1133 // Dump combiners
1134 if (!empty($utf_sort))
1135 {
1136 if ($sort)
1137 {
1138 ksort($utf_sort);
1139 }
1140
1141 foreach ($utf_sort as $utf_chars)
1142 {
1143 $tmp .= implode('', $utf_chars);
1144 }
1145 }
1146
1147 $tmp .= $c;
1148 $dump = $sort = 0;
1149 }
1150 else
1151 {
1152 $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;
1153 }
1154
1155 $tmp_pos = $starter_pos = $pos;
1156 $utf_sort = array();
1157 $last_cc = 0;
1158 }
1159 }
1160 while ($_pos < $_len);
1161 }
1162 else if (isset($utf_combining_class[$utf_char]))
1163 {
1164 // Combining character
1165 if ($utf_combining_class[$utf_char] < $last_cc)
1166 {
1167 // Not in canonical order
1168 $sort = $dump = 1;
1169 }
1170 else
1171 {
1172 $last_cc = $utf_combining_class[$utf_char];
1173 }
1174
1175 $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
1176 }
1177 else
1178 {
1179 // Non-decomposable starter, check out if it's a Hangul syllable
1180 if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)
1181 {
1182 // Nope, regular UTF char, check that we have the correct number of trailing bytes
1183 if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
1184 {
1185 // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
1186 // has been encoded in a five- or six- byte sequence.
1187 // Move the cursor back to its original position then advance it to the position it should really be at
1188 $pos -= $utf_len;
1189 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1190
1191 if (!empty($utf_sort))
1192 {
1193 ksort($utf_sort);
1194
1195 foreach ($utf_sort as $utf_chars)
1196 {
1197 $tmp .= implode('', $utf_chars);
1198 }
1199 $utf_sort = array();
1200 }
1201
1202 // Add a replacement char then another replacement char for every trailing byte.
1203 //
1204 // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
1205 $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);
1206 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);
1207
1208 $dump = $sort = 0;
1209
1210 $pos += $spn;
1211 $tmp_pos = $pos;
1212 continue;
1213 }
1214
1215 if (isset($extra_check[$utf_char[0]]))
1216 {
1217 switch ($utf_char[0])
1218 {
1219 // Note: 0xED is quite common in Korean
1220 case "\xED":
1221 if ($utf_char >= "\xED\xA0\x80")
1222 {
1223 // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
1224 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1225
1226 if (!empty($utf_sort))
1227 {
1228 ksort($utf_sort);
1229
1230 foreach ($utf_sort as $utf_chars)
1231 {
1232 $tmp .= implode('', $utf_chars);
1233 }
1234 $utf_sort = array();
1235 }
1236
1237 $tmp .= UTF8_REPLACEMENT;
1238 $dump = $sort = 0;
1239
1240 $tmp_pos = $starter_pos = $pos;
1241 continue 2;
1242 }
1243 break;
1244
1245 // Note: 0xEF is quite common in Japanese
1246 case "\xEF":
1247 if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
1248 {
1249 // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
1250 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1251
1252 if (!empty($utf_sort))
1253 {
1254 ksort($utf_sort);
1255
1256 foreach ($utf_sort as $utf_chars)
1257 {
1258 $tmp .= implode('', $utf_chars);
1259 }
1260 $utf_sort = array();
1261 }
1262
1263 $tmp .= UTF8_REPLACEMENT;
1264 $dump = $sort = 0;
1265
1266 $tmp_pos = $starter_pos = $pos;
1267 continue 2;
1268 }
1269 break;
1270
1271 case "\xC0":
1272 case "\xC1":
1273 if ($utf_char <= "\xC1\xBF")
1274 {
1275 // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
1276 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1277
1278 if (!empty($utf_sort))
1279 {
1280 ksort($utf_sort);
1281
1282 foreach ($utf_sort as $utf_chars)
1283 {
1284 $tmp .= implode('', $utf_chars);
1285 }
1286 $utf_sort = array();
1287 }
1288
1289 $tmp .= UTF8_REPLACEMENT;
1290 $dump = $sort = 0;
1291
1292 $tmp_pos = $starter_pos = $pos;
1293 continue 2;
1294 }
1295 break;
1296
1297 case "\xE0":
1298 if ($utf_char <= "\xE0\x9F\xBF")
1299 {
1300 // Unicode char U+0000..U+07FF encoded in 3 bytes
1301 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1302
1303 if (!empty($utf_sort))
1304 {
1305 ksort($utf_sort);
1306
1307 foreach ($utf_sort as $utf_chars)
1308 {
1309 $tmp .= implode('', $utf_chars);
1310 }
1311 $utf_sort = array();
1312 }
1313
1314 $tmp .= UTF8_REPLACEMENT;
1315 $dump = $sort = 0;
1316
1317 $tmp_pos = $starter_pos = $pos;
1318 continue 2;
1319 }
1320 break;
1321
1322 case "\xF0":
1323 if ($utf_char <= "\xF0\x8F\xBF\xBF")
1324 {
1325 // Unicode char U+0000..U+FFFF encoded in 4 bytes
1326 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1327
1328 if (!empty($utf_sort))
1329 {
1330 ksort($utf_sort);
1331
1332 foreach ($utf_sort as $utf_chars)
1333 {
1334 $tmp .= implode('', $utf_chars);
1335 }
1336 $utf_sort = array();
1337 }
1338
1339 $tmp .= UTF8_REPLACEMENT;
1340 $dump = $sort = 0;
1341
1342 $tmp_pos = $starter_pos = $pos;
1343 continue 2;
1344 }
1345 break;
1346
1347 default:
1348 if ($utf_char > UTF8_MAX)
1349 {
1350 // Out of the Unicode range
1351 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1352
1353 if (!empty($utf_sort))
1354 {
1355 ksort($utf_sort);
1356
1357 foreach ($utf_sort as $utf_chars)
1358 {
1359 $tmp .= implode('', $utf_chars);
1360 }
1361 $utf_sort = array();
1362 }
1363
1364 $tmp .= UTF8_REPLACEMENT;
1365 $dump = $sort = 0;
1366
1367 $tmp_pos = $starter_pos = $pos;
1368 continue 2;
1369 }
1370 break;
1371 }
1372 }
1373 }
1374 else
1375 {
1376 // Hangul syllable
1377 $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;
1378
1379 // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
1380 //
1381 // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
1382 if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
1383 {
1384 if ($t_index < 25)
1385 {
1386 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
1387 $utf_char[8] = chr(0xA7 + $t_index);
1388 }
1389 else
1390 {
1391 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
1392 $utf_char[8] = chr(0x67 + $t_index);
1393 }
1394 }
1395 else
1396 {
1397 $utf_char = "\xE1\x84\x00\xE1\x85\x00";
1398 }
1399
1400 $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
1401 $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
1402
1403 // Just like other decompositions, the resulting Jamos must be dumped to the tmp string
1404 $dump = 1;
1405 }
1406
1407 // Do we need to dump stuff to the tmp string?
1408 if ($dump)
1409 {
1410 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1411
1412 // Dump combiners
1413 if (!empty($utf_sort))
1414 {
1415 if ($sort)
1416 {
1417 ksort($utf_sort);
1418 }
1419
1420 foreach ($utf_sort as $utf_chars)
1421 {
1422 $tmp .= implode('', $utf_chars);
1423 }
1424 }
1425
1426 $tmp .= $utf_char;
1427 $dump = $sort = 0;
1428 $tmp_pos = $pos;
1429 }
1430
1431 $last_cc = 0;
1432 $utf_sort = array();
1433 $starter_pos = $pos;
1434 }
1435 }
1436 else
1437 {
1438 // ASCII char, which happens to be a starter (as any other ASCII char)
1439 if ($dump)
1440 {
1441 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1442
1443 // Dump combiners
1444 if (!empty($utf_sort))
1445 {
1446 if ($sort)
1447 {
1448 ksort($utf_sort);
1449 }
1450
1451 foreach ($utf_sort as $utf_chars)
1452 {
1453 $tmp .= implode('', $utf_chars);
1454 }
1455 }
1456
1457 $tmp .= $str[$pos];
1458 $dump = $sort = 0;
1459 $tmp_pos = ++$pos;
1460
1461 $pos += strspn($str, UTF8_ASCII_RANGE, $pos);
1462 }
1463 else
1464 {
1465 $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
1466 }
1467
1468 $last_cc = 0;
1469 $utf_sort = array();
1470 $starter_pos = $pos;
1471 }
1472 }
1473 while ($pos < $len);
1474
1475 // Now is time to return the string
1476 if ($dump)
1477 {
1478 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1479
1480 // Dump combiners
1481 if (!empty($utf_sort))
1482 {
1483 if ($sort)
1484 {
1485 ksort($utf_sort);
1486 }
1487
1488 foreach ($utf_sort as $utf_chars)
1489 {
1490 $tmp .= implode('', $utf_chars);
1491 }
1492 }
1493
1494 return $tmp;
1495 }
1496 else if ($tmp_pos)
1497 {
1498 // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
1499 if ($tmp_pos == $len)
1500 {
1501 // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
1502 return $tmp;
1503 }
1504 else
1505 {
1506 // The rightmost chunk of $str has not been appended to $tmp yet
1507 return $tmp . substr($str, $tmp_pos);
1508 }
1509 }
1510
1511 // The string was already in normal form
1512 return $str;
1513 }
1514 }
1515
1516 ?>