Verzeichnisstruktur phpBB-3.1.0
- Veröffentlicht
- 27.10.2014
So funktioniert es
|
Auf das letzte Element klicken. Dies geht jeweils einen Schritt zurück |
Auf das Icon klicken, dies öffnet das Verzeichnis. Nochmal klicken schließt das Verzeichnis. |
|
(Beispiel Datei-Icons)
|
Auf das Icon klicken, um den Quellcode anzuzeigen |
utf_normalizer.php
0001 <?php
0002 /**
0003 *
0004 * This file is part of the phpBB Forum Software package.
0005 *
0006 * @copyright (c) phpBB Limited <https://www.phpbb.com>
0007 * @license GNU General Public License, version 2 (GPL-2.0)
0008 *
0009 * For full copyright and license information, please see
0010 * the docs/CREDITS.txt file.
0011 *
0012 */
0013
0014 /** Stop right here unless the IN_PHPBB constant has been defined before this file was loaded
0015 */
0016 if (!defined('IN_PHPBB'))
0017 {
0018 exit;
0019 }
0020
0021 /**
0022 * Some Unicode characters encoded in UTF-8 (each constant holds the raw UTF-8 byte sequence of one code point)
0023 *
0024 * Preserved for compatibility
0025 */
0026 define('UTF8_REPLACEMENT', "\xEF\xBF\xBD"); // U+FFFD, substituted for invalid sequences by recompose()/decompose()
0027 define('UTF8_MAX', "\xF4\x8F\xBF\xBF"); // U+10FFFF, anything comparing greater is rejected in recompose()
0028 define('UTF8_FFFE', "\xEF\xBF\xBE"); // U+FFFE
0029 define('UTF8_FFFF', "\xEF\xBF\xBF"); // U+FFFF
0030 define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80"); // U+D800
0031 define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF"); // U+DFFF
0032 define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80"); // U+AC00
0033 define('UTF8_HANGUL_LAST', "\xED\x9E\xA3"); // U+D7A3
0034
0035 define('UTF8_CJK_FIRST', "\xE4\xB8\x80"); // U+4E00
0036 define('UTF8_CJK_LAST', "\xE9\xBE\xBB"); // U+9FBB
0037 define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80"); // U+20000
0038 define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96"); // U+2A6D6
0039
0040 // Unset the global variables the lookup tables are stored in; the methods below lazy-load a table only when isset() reports it missing. NOTE(review): presumably this guards against values injected from outside the data files - confirm
0041 unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
0042
0043 // NFC_QC and NFKC_QC values, i.e. the values stored in the quick-check tables handed to recompose()
0044 define('UNICODE_QC_MAYBE', 0);
0045 define('UNICODE_QC_NO', 1);
0046
0047 // Contains all the ASCII characters appearing in UTF-8, sorted by frequency; used as a strspn() mask to measure runs of ASCII bytes
0048 define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");
0049
0050 // Contains all the tail bytes that can appear in the composition of a UTF-8 char; used as a strspn() mask to measure runs of trailing bytes
0051 define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");
0052
0053 // Constants used by the Hangul [de]composition algorithms
0054 define('UNICODE_HANGUL_SBASE', 0xAC00); // Same code point as UTF8_HANGUL_FIRST (U+AC00)
0055 define('UNICODE_HANGUL_LBASE', 0x1100);
0056 define('UNICODE_HANGUL_VBASE', 0x1161);
0057 define('UNICODE_HANGUL_TBASE', 0x11A7);
0058 define('UNICODE_HANGUL_SCOUNT', 11172); // Equals LCOUNT * NCOUNT (19 * 588)
0059 define('UNICODE_HANGUL_LCOUNT', 19);
0060 define('UNICODE_HANGUL_VCOUNT', 21);
0061 define('UNICODE_HANGUL_TCOUNT', 28);
0062 define('UNICODE_HANGUL_NCOUNT', 588); // Equals VCOUNT * TCOUNT (21 * 28)
0063 define('UNICODE_JAMO_L', 0); // Value of $utf_jamo_type for a L jamo
0064 define('UNICODE_JAMO_V', 1); // Value of $utf_jamo_type for a V jamo
0065 define('UNICODE_JAMO_T', 2); // Value of $utf_jamo_type for a T jamo
0066
0067 /**
0068 * Unicode normalization routines
0069 */
0070 class utf_normalizer
0071 {
/**
 * Clean up a string in place and normalize it to NFC (canonical composition)
 *
 * A string that consists solely of the whitelisted bytes below is left
 * untouched. For any other string the remaining bytes below 0x20 (everything
 * except \t, \n and \r) are turned into 0xFF, and the result is handed to
 * recompose() together with the NFC quick-check table and the canonical
 * decomposition map. Both tables are loaded from their data files on first use.
 *
 * @param string &$str The dirty string; it is overwritten with the cleaned-up version
 * @return void Nothing is returned, the result is written back to $str
 */
static function cleanup(&$str)
{
	// List of all authorized characters, sorted by frequency in latin text
	$authorized = "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D";

	$len = strlen($str);
	$pos = strspn($str, $authorized);

	// When the authorized prefix covers the whole string there is nothing to do
	if ($pos != $len)
	{
		// Each table is checked and loaded on its own
		if (!isset($GLOBALS['utf_nfc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx;
		}

		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
		}

		// Every byte in the range 0x00..0x1F, except for \t, \n and \r, becomes a 0xFF byte.
		// 0xFF is illegal in UTF-8, so recompose() will in turn replace it with a UTF replacement char
		$control_bytes = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
		$illegal_bytes = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF";

		$str = self::recompose(
			strtr($str, $control_bytes, $illegal_bytes),
			$pos,
			$len,
			$GLOBALS['utf_nfc_qc'],
			$GLOBALS['utf_canonical_decomp']
		);
	}
}
0116
/**
 * Validate a UTF string and normalize it to NFC, in place
 *
 * Strings made only of bytes listed in UTF8_ASCII_RANGE are left untouched.
 * Everything else goes through recompose() with the NFC quick-check table and
 * the canonical decomposition map, each loaded from its data file on first use.
 *
 * @param string &$str Unchecked UTF string; it is overwritten with the validated, normalized version
 * @return void Nothing is returned, the result is written back to $str
 */
static function nfc(&$str)
{
	$len = strlen($str);
	$pos = strspn($str, UTF8_ASCII_RANGE);

	// Only strings that are not entirely ASCII need any work
	if ($pos != $len)
	{
		if (!isset($GLOBALS['utf_nfc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx;
		}

		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
		}

		$str = self::recompose(
			$str,
			$pos,
			$len,
			$GLOBALS['utf_nfc_qc'],
			$GLOBALS['utf_canonical_decomp']
		);
	}
}
0148
/**
 * Validate a UTF string and normalize it to NFKC, in place
 *
 * Strings made only of bytes listed in UTF8_ASCII_RANGE are left untouched.
 * Everything else goes through recompose() with the NFKC quick-check table and
 * the compatibility decomposition map, each loaded from its data file on first use.
 *
 * @param string &$str Unchecked UTF string; it is overwritten with the validated, normalized version
 * @return void Nothing is returned, the result is written back to $str
 */
static function nfkc(&$str)
{
	$len = strlen($str);
	$pos = strspn($str, UTF8_ASCII_RANGE);

	// Only strings that are not entirely ASCII need any work
	if ($pos != $len)
	{
		if (!isset($GLOBALS['utf_nfkc_qc']))
		{
			global $phpbb_root_path, $phpEx;
			include $phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx;
		}

		if (!isset($GLOBALS['utf_compatibility_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx;
		}

		$str = self::recompose(
			$str,
			$pos,
			$len,
			$GLOBALS['utf_nfkc_qc'],
			$GLOBALS['utf_compatibility_decomp']
		);
	}
}
0180
/**
 * Validate a UTF string and normalize it to NFD, in place
 *
 * Strings made only of bytes listed in UTF8_ASCII_RANGE are left untouched.
 * Everything else goes through decompose() with the canonical decomposition
 * map, which is loaded from its data file on first use.
 *
 * @param string &$str Unchecked UTF string; it is overwritten with the validated, normalized version
 * @return void Nothing is returned, the result is written back to $str
 */
static function nfd(&$str)
{
	$len = strlen($str);
	$pos = strspn($str, UTF8_ASCII_RANGE);

	// Only strings that are not entirely ASCII need any work
	if ($pos != $len)
	{
		if (!isset($GLOBALS['utf_canonical_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
		}

		$str = self::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
	}
}
0206
/**
 * Validate a UTF string and normalize it to NFKD, in place
 *
 * Strings made only of bytes listed in UTF8_ASCII_RANGE are left untouched.
 * Everything else goes through decompose() with the compatibility
 * decomposition map, which is loaded from its data file on first use.
 *
 * @param string &$str Unchecked UTF string; it is overwritten with the validated, normalized version
 * @return void Nothing is returned, the result is written back to $str
 */
static function nfkd(&$str)
{
	$len = strlen($str);
	$pos = strspn($str, UTF8_ASCII_RANGE);

	// Only strings that are not entirely ASCII need any work
	if ($pos != $len)
	{
		if (!isset($GLOBALS['utf_compatibility_decomp']))
		{
			global $phpbb_root_path, $phpEx;
			include $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx;
		}

		$str = self::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
	}
}
0232
0233
0234 /**
0235 * Validate a UTF string and recompose it. nfc()/cleanup() and nfkc() both end up here, they only differ in the tables they pass in. Ill-formed or disallowed sequences are replaced with UTF8_REPLACEMENT along the way
0236 *
0237 * @param string $str Unchecked UTF string
0238 * @param integer $pos Position of the first UTF char (in bytes). Must be lower than $len: the main loop reads $str[$pos] before it checks the bounds
0239 * @param integer $len Length of the string (in bytes)
0240 * @param array &$qc Quick-check array, passed by reference but never modified
0241 * @param array &$decomp_map Decomposition mapping, passed by reference but never modified
0242 * @return string The string, validated and recomposed. $str is returned as-is if nothing had to be changed
0243 *
0244 * @access private
0245 */
0246 static function recompose($str, $pos, $len, &$qc, &$decomp_map)
0247 {
0248 global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;
0249
0250 // Load some commonly-used tables
0251 if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
0252 {
0253 global $phpbb_root_path, $phpEx;
0254 include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
0255 }
0256
0257 // Load the canonical composition table
0258 if (!isset($utf_canonical_comp))
0259 {
0260 global $phpbb_root_path, $phpEx;
0261 include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
0262 }
0263
0264 // Buffer the last ASCII char before the UTF-8 stuff if applicable
0265 $tmp = ''; // The fixed string, built piece by piece. It stays empty as long as nothing had to be changed
0266 $i = $tmp_pos = $last_cc = 0; // $i: index of the last buffered char, $tmp_pos: position in $str up to which $tmp is complete, $last_cc: combining class of the previous char
0267
0268 $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
0269
0270 // UTF char length array
0271 // This array is used to determine the length of a UTF character.
0272 // Let $c be the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
0273 // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
0274 // Otherwise, if $utf_len_mask[$c] is greater than 0, we have the leading byte of a multibyte character
0275 // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
0276 $utf_len_mask = array(
0277 // Leading bytes masks
0278 "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
0279 // Trailing bytes masks
0280 "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
0281 );
0282 // Some extra checks are triggered on the first byte of a UTF sequence, these are the leading bytes concerned
0283 $extra_check = array(
0284 "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
0285 "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
0286 "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
0287 );
0288 // These masks are used to check if a UTF sequence is well formed: ($utf_char & mask) must be equal to the matching $utf_validation_check entry
0289 $utf_validation_mask = array(
0290 2 => "\xE0\xC0",
0291 3 => "\xF0\xC0\xC0",
0292 4 => "\xF8\xC0\xC0\xC0"
0293 );
0294
0295 $utf_validation_check = array(
0296 2 => "\xC0\x80",
0297 3 => "\xE0\x80\x80",
0298 4 => "\xF0\x80\x80\x80"
0299 );
0300
0301 // Main loop
0302 do
0303 {
0304 // STEP 0: Capture the current char and buffer it
0305 $c = $str[$pos];
0306 $c_mask = $c & "\xF0";
0307
0308 if (isset($utf_len_mask[$c_mask]))
0309 {
0310 // Byte at $pos is either a leading byte or a misplaced trailing byte
0311 if ($utf_len = $utf_len_mask[$c_mask])
0312 {
0313 // Capture the char
0314 $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);
0315
0316 // Let's find out if a thorough check is needed
0317 if (isset($qc[$utf_char]))
0318 {
0319 // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
0320 }
0321 else if (isset($utf_combining_class[$utf_char]))
0322 {
0323 if ($utf_combining_class[$utf_char] < $last_cc)
0324 {
0325 // A combining character that is NOT canonically ordered
0326 }
0327 else
0328 {
0329 // A combining character that IS canonically ordered, skip to the next char
0330 $last_cc = $utf_combining_class[$utf_char];
0331
0332 $pos += $utf_len;
0333 continue;
0334 }
0335 }
0336 else
0337 {
0338 // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
0339 // It can be a singleton, a canonical composite, a replacement char or even an ill-formed bunch of bytes. Let's find out
0340 $last_cc = 0;
0341
0342 // Check that we have the correct number of trailing bytes
0343 if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
0344 {
0345 // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
0346 // has been encoded in a five- or six- byte sequence
0347 if ($utf_char[0] >= "\xF8")
0348 {
0349 if ($utf_char[0] < "\xFC")
0350 {
0351 $trailing_bytes = 4;
0352 }
0353 else if ($utf_char[0] > "\xFD")
0354 {
0355 $trailing_bytes = 0;
0356 }
0357 else
0358 {
0359 $trailing_bytes = 5;
0360 }
0361 }
0362 else
0363 {
0364 $trailing_bytes = $utf_len - 1;
0365 }
0366
0367 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0368 $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
0369 $tmp_pos = $pos;
0370
0371 continue;
0372 }
0373
0374 if (isset($extra_check[$c]))
0375 {
0376 switch ($c)
0377 {
0378 // Note: 0xED is quite common in Korean
0379 case "\xED":
0380 if ($utf_char >= "\xED\xA0\x80")
0381 {
0382 // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
0383 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0384 $pos += $utf_len;
0385 $tmp_pos = $pos;
0386 continue 2;
0387 }
0388 break;
0389
0390 // Note: 0xEF is quite common in Japanese
0391 case "\xEF":
0392 if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
0393 {
0394 // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
0395 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0396 $pos += $utf_len;
0397 $tmp_pos = $pos;
0398 continue 2;
0399 }
0400 break;
0401
0402 case "\xC0":
0403 case "\xC1":
0404 if ($utf_char <= "\xC1\xBF")
0405 {
0406 // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
0407 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0408 $pos += $utf_len;
0409 $tmp_pos = $pos;
0410 continue 2;
0411 }
0412 break;
0413
0414 case "\xE0":
0415 if ($utf_char <= "\xE0\x9F\xBF")
0416 {
0417 // Unicode char U+0000..U+07FF encoded in 3 bytes
0418 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0419 $pos += $utf_len;
0420 $tmp_pos = $pos;
0421 continue 2;
0422 }
0423 break;
0424
0425 case "\xF0":
0426 if ($utf_char <= "\xF0\x8F\xBF\xBF")
0427 {
0428 // Unicode char U+0000..U+FFFF encoded in 4 bytes
0429 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0430 $pos += $utf_len;
0431 $tmp_pos = $pos;
0432 continue 2;
0433 }
0434 break;
0435
0436 default:
0437 // Five- and six- byte sequences do not need being checked for here anymore
0438 if ($utf_char > UTF8_MAX)
0439 {
0440 // Out of the Unicode range
0441 if ($utf_char[0] < "\xF8")
0442 {
0443 $trailing_bytes = 3;
0444 }
0445 else if ($utf_char[0] < "\xFC")
0446 {
0447 $trailing_bytes = 4;
0448 }
0449 else if ($utf_char[0] > "\xFD")
0450 {
0451 $trailing_bytes = 0;
0452 }
0453 else
0454 {
0455 $trailing_bytes = 5;
0456 }
0457
0458 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0459 $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
0460 $tmp_pos = $pos;
0461 continue 2;
0462 }
0463 break;
0464 }
0465 }
0466
0467 // The char is a valid starter, move the cursor and go on
0468 $pos += $utf_len;
0469 continue;
0470 }
0471 }
0472 else
0473 {
0474 // A trailing byte came out of nowhere, we will advance the cursor and treat this byte and all following trailing bytes as if
0475 // each of them was a Unicode replacement char
0476 $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
0477 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
0478
0479 $pos += $spn;
0480 $tmp_pos = $pos;
0481 continue;
0482 }
0483
0484 // STEP 1: Decompose current char
0485
0486 // We have found a character that is either:
0487 // - in the NFC_QC/NFKC_QC list
0488 // - a non-starter char that is not canonically ordered
0489 //
0490 // We are going to capture the shortest UTF sequence that satisfies these two conditions:
0491 //
0492 // 1 - If the sequence does not start at the beginning of the string, it must begin with a starter,
0493 // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
0494 //
0495 // 2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
0496 // immediately followed by a starter that is not on the QC list
0497 //
0498 $utf_seq = array();
0499 $last_cc = 0;
0500 $lpos = $pos;
0501 $pos += $utf_len;
0502
0503 if (isset($decomp_map[$utf_char]))
0504 {
0505 $_pos = 0;
0506 $_len = strlen($decomp_map[$utf_char]);
0507
0508 do
0509 {
0510 $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
0511
0512 if (isset($_utf_len))
0513 {
0514 $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0515 $_pos += $_utf_len;
0516 }
0517 else
0518 {
0519 $utf_seq[] = $decomp_map[$utf_char][$_pos];
0520 ++$_pos;
0521 }
0522 }
0523 while ($_pos < $_len);
0524 }
0525 else
0526 {
0527 // The char is not decomposable
0528 $utf_seq = array($utf_char);
0529 }
0530
0531 // STEP 2: Capture the starter
0532
0533 // Check out the combining class of the first character of the UTF sequence
0534 $k = 0;
0535 if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
0536 {
0537 // Not a starter, inspect previous characters
0538 // The last 8 characters are kept in a buffer so that we don't have to capture them every time.
0539 // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
0540 // although it is slower than this method.
0541 //
0542 // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
0543 // at offset $i) and process them in backward mode until we find a starter.
0544 //
0545 // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
0546 // characters numbered 0 to n. $k starts at 0 and is pre-decremented for each char we prepend, so prepended chars are stored under negative indexes
0547 $starter_found = 0;
0548 $j_min = max(1, $i - 7);
0549
0550 for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
0551 {
0552 $utf_char = $buffer[$j & 7];
0553 $lpos -= strlen($utf_char);
0554
0555 if (isset($decomp_map[$utf_char]))
0556 {
0557 // The char is a composite, decompose for storage
0558 $decomp_seq = array();
0559 $_pos = 0;
0560 $_len = strlen($decomp_map[$utf_char]);
0561
0562 do
0563 {
0564 $c = $decomp_map[$utf_char][$_pos];
0565 $_utf_len =& $utf_len_mask[$c & "\xF0"];
0566
0567 if (isset($_utf_len))
0568 {
0569 $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0570 $_pos += $_utf_len;
0571 }
0572 else
0573 {
0574 $decomp_seq[] = $c;
0575 ++$_pos;
0576 }
0577 }
0578 while ($_pos < $_len);
0579
0580 // Prepend the UTF sequence with our decomposed sequence
0581 if (isset($decomp_seq[1]))
0582 {
0583 // The char expanded into several chars
0584 $decomp_cnt = sizeof($decomp_seq);
0585
0586 foreach ($decomp_seq as $decomp_i => $decomp_char)
0587 {
0588 $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
0589 }
0590 $k -= $decomp_cnt;
0591 }
0592 else
0593 {
0594 // Decomposed to a single char, easier to prepend
0595 $utf_seq[--$k] = $decomp_seq[0];
0596 }
0597 }
0598 else
0599 {
0600 $utf_seq[--$k] = $utf_char;
0601 }
0602
0603 if (!isset($utf_combining_class[$utf_seq[$k]]))
0604 {
0605 // We have found our starter
0606 $starter_found = 1;
0607 break;
0608 }
0609 }
0610
0611 if (!$starter_found && $lpos > $tmp_pos)
0612 {
0613 // The starter was not found in the buffer, let's rewind some more
0614 do
0615 {
0616 // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.
0617 $c = $str[--$lpos];
0618 $c_mask = $c & "\xF0";
0619
0620 if (isset($utf_len_mask[$c_mask]))
0621 {
0622 // UTF byte
0623 if ($utf_len = $utf_len_mask[$c_mask])
0624 {
0625 // UTF *leading* byte
0626 $utf_char = substr($str, $lpos, $utf_len);
0627
0628 if (isset($decomp_map[$utf_char]))
0629 {
0630 // Decompose the character
0631 $decomp_seq = array();
0632 $_pos = 0;
0633 $_len = strlen($decomp_map[$utf_char]);
0634
0635 do
0636 {
0637 $c = $decomp_map[$utf_char][$_pos];
0638 $_utf_len =& $utf_len_mask[$c & "\xF0"];
0639
0640 if (isset($_utf_len))
0641 {
0642 $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0643 $_pos += $_utf_len;
0644 }
0645 else
0646 {
0647 $decomp_seq[] = $c;
0648 ++$_pos;
0649 }
0650 }
0651 while ($_pos < $_len);
0652
0653 // Prepend the UTF sequence with our decomposed sequence
0654 if (isset($decomp_seq[1]))
0655 {
0656 // The char expanded into several chars
0657 $decomp_cnt = sizeof($decomp_seq);
0658 foreach ($decomp_seq as $decomp_i => $utf_char)
0659 {
0660 $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
0661 }
0662 $k -= $decomp_cnt;
0663 }
0664 else
0665 {
0666 // Decomposed to a single char, easier to prepend
0667 $utf_seq[--$k] = $decomp_seq[0];
0668 }
0669 }
0670 else
0671 {
0672 $utf_seq[--$k] = $utf_char;
0673 }
0674 }
0675 }
0676 else
0677 {
0678 // ASCII char
0679 $utf_seq[--$k] = $c;
0680 }
0681 }
0682 while ($lpos > $tmp_pos);
0683 }
0684 }
0685
0686 // STEP 3: Capture following combining modifiers
0687
0688 while ($pos < $len)
0689 {
0690 $c_mask = $str[$pos] & "\xF0";
0691
0692 if (isset($utf_len_mask[$c_mask]))
0693 {
0694 if ($utf_len = $utf_len_mask[$c_mask])
0695 {
0696 $utf_char = substr($str, $pos, $utf_len);
0697 }
0698 else
0699 {
0700 // A trailing byte came out of nowhere
0701 // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
0702 // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
0703 break;
0704 }
0705
0706 if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
0707 {
0708 // Combining character, add it to the sequence and move the cursor
0709 if (isset($decomp_map[$utf_char]))
0710 {
0711 // Decompose the character
0712 $_pos = 0;
0713 $_len = strlen($decomp_map[$utf_char]);
0714
0715 do
0716 {
0717 $c = $decomp_map[$utf_char][$_pos];
0718 $_utf_len =& $utf_len_mask[$c & "\xF0"];
0719
0720 if (isset($_utf_len))
0721 {
0722 $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0723 $_pos += $_utf_len;
0724 }
0725 else
0726 {
0727 $utf_seq[] = $c;
0728 ++$_pos;
0729 }
0730 }
0731 while ($_pos < $_len);
0732 }
0733 else
0734 {
0735 $utf_seq[] = $utf_char;
0736 }
0737
0738 $pos += $utf_len;
0739 }
0740 else
0741 {
0742 // Combining class 0 and no QC, break out of the loop
0743 // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
0744 break;
0745 }
0746 }
0747 else
0748 {
0749 // ASCII chars are starters
0750 break;
0751 }
0752 }
0753
0754 // STEP 4: Sort and combine
0755
0756 // Here we sort...
0757 $k_max = $k + sizeof($utf_seq);
0758
0759 if (!$k && $k_max == 1)
0760 {
0761 // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
0762 // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
0763 // if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
0764 // {
0765 $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
0766 $tmp_pos = $pos;
0767 // }
0768
0769 continue;
0770 }
0771
0772 // ...there we combine
0773 if (isset($utf_combining_class[$utf_seq[$k]]))
0774 {
0775 $starter = $nf_seq = '';
0776 }
0777 else
0778 {
0779 $starter = $utf_seq[$k++];
0780 $nf_seq = '';
0781 }
0782 $utf_sort = array();
0783
0784 // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
0785 // at the end of the string without altering it
0786 $utf_seq[] = '';
0787
0788 do
0789 {
0790 $utf_char = $utf_seq[$k++];
0791
0792 if (isset($utf_combining_class[$utf_char]))
0793 {
0794 $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
0795 }
0796 else
0797 {
0798 if (empty($utf_sort))
0799 {
0800 // No combining characters... check for a composite of the two starters
0801 if (isset($utf_canonical_comp[$starter . $utf_char]))
0802 {
0803 // Good ol' composite character
0804 $starter = $utf_canonical_comp[$starter . $utf_char];
0805 }
0806 else if (isset($utf_jamo_type[$utf_char]))
0807 {
0808 // Current char is a composable jamo
0809 if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
0810 {
0811 // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
0812 if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
0813 {
0814 // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
0815 $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
0816 ++$k;
0817 }
0818 else
0819 {
0820 // L+V jamos, combine to a LV Hangul syllable
0821 $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
0822 }
0823
0824 $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
0825 }
0826 else
0827 {
0828 // Non-composable jamo, just add it to the sequence
0829 $nf_seq .= $starter;
0830 $starter = $utf_char;
0831 }
0832 }
0833 else
0834 {
0835 // No composite, just add the first starter to the sequence then continue with the other one
0836 $nf_seq .= $starter;
0837 $starter = $utf_char;
0838 }
0839 }
0840 else
0841 {
0842 ksort($utf_sort);
0843
0844 // For each class of combining characters
0845 foreach ($utf_sort as $cc => $utf_chars)
0846 {
0847 $j = 0;
0848
0849 do
0850 {
0851 // Look for a composite
0852 if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
0853 {
0854 // Found a composite, replace the starter
0855 $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
0856 unset($utf_sort[$cc][$j]);
0857 }
0858 else
0859 {
0860 // No composite, all following characters in that class are blocked
0861 break;
0862 }
0863 }
0864 while (isset($utf_sort[$cc][++$j]));
0865 }
0866
0867 // Add the starter to the normalized sequence, followed by non-starters in canonical order
0868 $nf_seq .= $starter;
0869
0870 foreach ($utf_sort as $utf_chars)
0871 {
0872 if (!empty($utf_chars))
0873 {
0874 $nf_seq .= implode('', $utf_chars);
0875 }
0876 }
0877
0878 // Reset the array and go on
0879 $utf_sort = array();
0880 $starter = $utf_char;
0881 }
0882 }
0883 }
0884 while ($k <= $k_max);
0885
0886 $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
0887 $tmp_pos = $pos;
0888 }
0889 else
0890 {
0891 // Only an ASCII char can make the program get here
0892 //
0893 // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
0894 //
0895 // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
0896 // multi-byte text (where the only ASCII chars are spaces and punctuation)
0897 if (++$pos != $len)
0898 {
0899 if ($str[$pos] < "\x80")
0900 {
0901 $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
0902 $buffer[++$i & 7] = $str[$pos - 1];
0903 }
0904 else
0905 {
0906 $buffer[++$i & 7] = $c;
0907 }
0908 }
0909 }
0910 }
0911 while ($pos < $len);
0912
0913 // Now is time to return the string
0914 if ($tmp_pos)
0915 {
0916 // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
0917 if ($tmp_pos == $len)
0918 {
0919 // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
0920 return $tmp;
0921 }
0922 else
0923 {
0924 // The rightmost chunk of $str has not been appended to $tmp yet
0925 return $tmp . substr($str, $tmp_pos);
0926 }
0927 }
0928
0929 // The string was already in normal form
0930 return $str;
0931 }
0932
0933 /**
0934 * Decompose a UTF string
0935 *
0936 * @param string $str UTF string
0937 * @param integer $pos Position of the first UTF char (in bytes)
0938 * @param integer $len Length of the string (in bytes)
0939 * @param array &$decomp_map Decomposition mapping, passed by reference but never modified
0940 * @return string The string, decomposed and sorted canonically
0941 *
0942 * @access private
0943 */
0944 static function decompose($str, $pos, $len, &$decomp_map)
0945 {
0946 global $utf_combining_class;
0947
0948 // Load some commonly-used tables
0949 if (!isset($utf_combining_class))
0950 {
0951 global $phpbb_root_path, $phpEx;
0952 include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
0953 }
0954
0955 // UTF char length array
0956 $utf_len_mask = array(
0957 // Leading bytes masks
0958 "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
0959 // Trailing bytes masks
0960 "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
0961 );
0962
0963 // Some extra checks are triggered on the first byte of a UTF sequence
0964 $extra_check = array(
0965 "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
0966 "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
0967 "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
0968 );
0969
0970 // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
0971 // - 2-byte: 110? ???? 10?? ????
0972 // - 3-byte: 1110 ???? 10?? ???? 10?? ????
0973 // - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
0974 // Note that 5- and 6- byte sequences are automatically discarded
0975 $utf_validation_mask = array(
0976 2 => "\xE0\xC0",
0977 3 => "\xF0\xC0\xC0",
0978 4 => "\xF8\xC0\xC0\xC0"
0979 );
0980
0981 $utf_validation_check = array(
0982 2 => "\xC0\x80",
0983 3 => "\xE0\x80\x80",
0984 4 => "\xF0\x80\x80\x80"
0985 );
0986
0987 $tmp = '';
0988 $starter_pos = $pos;
0989 $tmp_pos = $last_cc = $sort = $dump = 0;
0990 $utf_sort = array();
0991
0992 // Main loop
0993 do
0994 {
0995 // STEP 0: Capture the current char
0996
0997 $cur_mask = $str[$pos] & "\xF0";
0998 if (isset($utf_len_mask[$cur_mask]))
0999 {
1000 if ($utf_len = $utf_len_mask[$cur_mask])
1001 {
1002 // Multibyte char
1003 $utf_char = substr($str, $pos, $utf_len);
1004 $pos += $utf_len;
1005 }
1006 else
1007 {
1008 // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
1009 // replacement char and we will advance the cursor
1010 $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
1011
1012 if ($dump)
1013 {
1014 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1015
1016 // Dump combiners
1017 if (!empty($utf_sort))
1018 {
1019 if ($sort)
1020 {
1021 ksort($utf_sort);
1022 }
1023
1024 foreach ($utf_sort as $utf_chars)
1025 {
1026 $tmp .= implode('', $utf_chars);
1027 }
1028 }
1029
1030 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);
1031 $dump = $sort = 0;
1032 }
1033 else
1034 {
1035 $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
1036 }
1037
1038 $pos += $spn;
1039 $tmp_pos = $starter_pos = $pos;
1040
1041 $utf_sort = array();
1042 $last_cc = 0;
1043
1044 continue;
1045 }
1046
1047 // STEP 1: Decide what to do with current char
1048
1049 // Now, in that order:
1050 // - check if that character is decomposable
1051 // - check if that character is a non-starter
1052 // - check if that character requires extra checks to be performed
1053 if (isset($decomp_map[$utf_char]))
1054 {
1055 // Decompose the char
1056 $_pos = 0;
1057 $_len = strlen($decomp_map[$utf_char]);
1058
1059 do
1060 {
1061 $c = $decomp_map[$utf_char][$_pos];
1062 $_utf_len =& $utf_len_mask[$c & "\xF0"];
1063
1064 if (isset($_utf_len))
1065 {
1066 $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);
1067 $_pos += $_utf_len;
1068
1069 if (isset($utf_combining_class[$_utf_char]))
1070 {
1071 // The character decomposed to a non-starter, buffer it for sorting
1072 $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;
1073
1074 if ($utf_combining_class[$_utf_char] < $last_cc)
1075 {
1076 // Not canonically ordered, will require sorting
1077 $sort = $dump = 1;
1078 }
1079 else
1080 {
1081 $dump = 1;
1082 $last_cc = $utf_combining_class[$_utf_char];
1083 }
1084 }
1085 else
1086 {
1087 // This character decomposition contains a starter, dump the buffer and continue
1088 if ($dump)
1089 {
1090 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1091
1092 // Dump combiners
1093 if (!empty($utf_sort))
1094 {
1095 if ($sort)
1096 {
1097 ksort($utf_sort);
1098 }
1099
1100 foreach ($utf_sort as $utf_chars)
1101 {
1102 $tmp .= implode('', $utf_chars);
1103 }
1104 }
1105
1106 $tmp .= $_utf_char;
1107 $dump = $sort = 0;
1108 }
1109 else
1110 {
1111 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;
1112 }
1113
1114 $tmp_pos = $starter_pos = $pos;
1115 $utf_sort = array();
1116 $last_cc = 0;
1117 }
1118 }
1119 else
1120 {
1121 // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
1122 ++$_pos;
1123
1124 if ($dump)
1125 {
1126 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1127
1128 // Dump combiners
1129 if (!empty($utf_sort))
1130 {
1131 if ($sort)
1132 {
1133 ksort($utf_sort);
1134 }
1135
1136 foreach ($utf_sort as $utf_chars)
1137 {
1138 $tmp .= implode('', $utf_chars);
1139 }
1140 }
1141
1142 $tmp .= $c;
1143 $dump = $sort = 0;
1144 }
1145 else
1146 {
1147 $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;
1148 }
1149
1150 $tmp_pos = $starter_pos = $pos;
1151 $utf_sort = array();
1152 $last_cc = 0;
1153 }
1154 }
1155 while ($_pos < $_len);
1156 }
1157 else if (isset($utf_combining_class[$utf_char]))
1158 {
1159 // Combining character
1160 if ($utf_combining_class[$utf_char] < $last_cc)
1161 {
1162 // Not in canonical order
1163 $sort = $dump = 1;
1164 }
1165 else
1166 {
1167 $last_cc = $utf_combining_class[$utf_char];
1168 }
1169
1170 $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
1171 }
1172 else
1173 {
 1174 // Non-decomposable starter, check whether it's a Hangul syllable
1175 if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)
1176 {
1177 // Nope, regular UTF char, check that we have the correct number of trailing bytes
1178 if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
1179 {
1180 // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
1181 // has been encoded in a five- or six- byte sequence.
1182 // Move the cursor back to its original position then advance it to the position it should really be at
1183 $pos -= $utf_len;
1184 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1185
1186 if (!empty($utf_sort))
1187 {
1188 ksort($utf_sort);
1189
1190 foreach ($utf_sort as $utf_chars)
1191 {
1192 $tmp .= implode('', $utf_chars);
1193 }
1194 $utf_sort = array();
1195 }
1196
1197 // Add a replacement char then another replacement char for every trailing byte.
1198 //
1199 // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
1200 $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);
1201 $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);
1202
1203 $dump = $sort = 0;
1204
1205 $pos += $spn;
1206 $tmp_pos = $pos;
1207 continue;
1208 }
1209
1210 if (isset($extra_check[$utf_char[0]]))
1211 {
1212 switch ($utf_char[0])
1213 {
1214 // Note: 0xED is quite common in Korean
1215 case "\xED":
1216 if ($utf_char >= "\xED\xA0\x80")
1217 {
1218 // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
1219 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1220
1221 if (!empty($utf_sort))
1222 {
1223 ksort($utf_sort);
1224
1225 foreach ($utf_sort as $utf_chars)
1226 {
1227 $tmp .= implode('', $utf_chars);
1228 }
1229 $utf_sort = array();
1230 }
1231
1232 $tmp .= UTF8_REPLACEMENT;
1233 $dump = $sort = 0;
1234
1235 $tmp_pos = $starter_pos = $pos;
1236 continue 2;
1237 }
1238 break;
1239
1240 // Note: 0xEF is quite common in Japanese
1241 case "\xEF":
1242 if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
1243 {
1244 // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
1245 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1246
1247 if (!empty($utf_sort))
1248 {
1249 ksort($utf_sort);
1250
1251 foreach ($utf_sort as $utf_chars)
1252 {
1253 $tmp .= implode('', $utf_chars);
1254 }
1255 $utf_sort = array();
1256 }
1257
1258 $tmp .= UTF8_REPLACEMENT;
1259 $dump = $sort = 0;
1260
1261 $tmp_pos = $starter_pos = $pos;
1262 continue 2;
1263 }
1264 break;
1265
1266 case "\xC0":
1267 case "\xC1":
1268 if ($utf_char <= "\xC1\xBF")
1269 {
1270 // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
1271 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1272
1273 if (!empty($utf_sort))
1274 {
1275 ksort($utf_sort);
1276
1277 foreach ($utf_sort as $utf_chars)
1278 {
1279 $tmp .= implode('', $utf_chars);
1280 }
1281 $utf_sort = array();
1282 }
1283
1284 $tmp .= UTF8_REPLACEMENT;
1285 $dump = $sort = 0;
1286
1287 $tmp_pos = $starter_pos = $pos;
1288 continue 2;
1289 }
1290 break;
1291
1292 case "\xE0":
1293 if ($utf_char <= "\xE0\x9F\xBF")
1294 {
1295 // Unicode char U+0000..U+07FF encoded in 3 bytes
1296 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1297
1298 if (!empty($utf_sort))
1299 {
1300 ksort($utf_sort);
1301
1302 foreach ($utf_sort as $utf_chars)
1303 {
1304 $tmp .= implode('', $utf_chars);
1305 }
1306 $utf_sort = array();
1307 }
1308
1309 $tmp .= UTF8_REPLACEMENT;
1310 $dump = $sort = 0;
1311
1312 $tmp_pos = $starter_pos = $pos;
1313 continue 2;
1314 }
1315 break;
1316
1317 case "\xF0":
1318 if ($utf_char <= "\xF0\x8F\xBF\xBF")
1319 {
1320 // Unicode char U+0000..U+FFFF encoded in 4 bytes
1321 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1322
1323 if (!empty($utf_sort))
1324 {
1325 ksort($utf_sort);
1326
1327 foreach ($utf_sort as $utf_chars)
1328 {
1329 $tmp .= implode('', $utf_chars);
1330 }
1331 $utf_sort = array();
1332 }
1333
1334 $tmp .= UTF8_REPLACEMENT;
1335 $dump = $sort = 0;
1336
1337 $tmp_pos = $starter_pos = $pos;
1338 continue 2;
1339 }
1340 break;
1341
1342 default:
1343 if ($utf_char > UTF8_MAX)
1344 {
1345 // Out of the Unicode range
1346 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1347
1348 if (!empty($utf_sort))
1349 {
1350 ksort($utf_sort);
1351
1352 foreach ($utf_sort as $utf_chars)
1353 {
1354 $tmp .= implode('', $utf_chars);
1355 }
1356 $utf_sort = array();
1357 }
1358
1359 $tmp .= UTF8_REPLACEMENT;
1360 $dump = $sort = 0;
1361
1362 $tmp_pos = $starter_pos = $pos;
1363 continue 2;
1364 }
1365 break;
1366 }
1367 }
1368 }
1369 else
1370 {
1371 // Hangul syllable
1372 $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;
1373
1374 // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
1375 //
1376 // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
1377 if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
1378 {
1379 if ($t_index < 25)
1380 {
1381 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
1382 $utf_char[8] = chr(0xA7 + $t_index);
1383 }
1384 else
1385 {
1386 $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
1387 $utf_char[8] = chr(0x67 + $t_index);
1388 }
1389 }
1390 else
1391 {
1392 $utf_char = "\xE1\x84\x00\xE1\x85\x00";
1393 }
1394
1395 $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
1396 $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
1397
1398 // Just like other decompositions, the resulting Jamos must be dumped to the tmp string
1399 $dump = 1;
1400 }
1401
1402 // Do we need to dump stuff to the tmp string?
1403 if ($dump)
1404 {
1405 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1406
1407 // Dump combiners
1408 if (!empty($utf_sort))
1409 {
1410 if ($sort)
1411 {
1412 ksort($utf_sort);
1413 }
1414
1415 foreach ($utf_sort as $utf_chars)
1416 {
1417 $tmp .= implode('', $utf_chars);
1418 }
1419 }
1420
1421 $tmp .= $utf_char;
1422 $dump = $sort = 0;
1423 $tmp_pos = $pos;
1424 }
1425
1426 $last_cc = 0;
1427 $utf_sort = array();
1428 $starter_pos = $pos;
1429 }
1430 }
1431 else
1432 {
1433 // ASCII char, which happens to be a starter (as any other ASCII char)
1434 if ($dump)
1435 {
1436 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1437
1438 // Dump combiners
1439 if (!empty($utf_sort))
1440 {
1441 if ($sort)
1442 {
1443 ksort($utf_sort);
1444 }
1445
1446 foreach ($utf_sort as $utf_chars)
1447 {
1448 $tmp .= implode('', $utf_chars);
1449 }
1450 }
1451
1452 $tmp .= $str[$pos];
1453 $dump = $sort = 0;
1454 $tmp_pos = ++$pos;
1455
1456 $pos += strspn($str, UTF8_ASCII_RANGE, $pos);
1457 }
1458 else
1459 {
1460 $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
1461 }
1462
1463 $last_cc = 0;
1464 $utf_sort = array();
1465 $starter_pos = $pos;
1466 }
1467 }
1468 while ($pos < $len);
1469
 1470 // Now it is time to return the string
1471 if ($dump)
1472 {
1473 $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1474
1475 // Dump combiners
1476 if (!empty($utf_sort))
1477 {
1478 if ($sort)
1479 {
1480 ksort($utf_sort);
1481 }
1482
1483 foreach ($utf_sort as $utf_chars)
1484 {
1485 $tmp .= implode('', $utf_chars);
1486 }
1487 }
1488
1489 return $tmp;
1490 }
1491 else if ($tmp_pos)
1492 {
1493 // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
1494 if ($tmp_pos == $len)
1495 {
1496 // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
1497 return $tmp;
1498 }
1499 else
1500 {
1501 // The rightmost chunk of $str has not been appended to $tmp yet
1502 return $tmp . substr($str, $tmp_pos);
1503 }
1504 }
1505
1506 // The string was already in normal form
1507 return $str;
1508 }
1509 }
1510