Verzeichnisstruktur phpBB-3.1.0


Veröffentlicht
27.10.2014

So funktioniert es


Auf das letzte Element klicken. Dies geht jeweils einen Schritt zurück.

Auf das Icon klicken, dies öffnet das Verzeichnis. Nochmal klicken schließt das Verzeichnis.
Auf den Verzeichnisnamen klicken, dies zeigt nur das Verzeichnis mit Inhalt an

(Beispiel Datei-Icons)

Auf das Icon klicken, um den Quellcode anzuzeigen

utf_normalizer.php

Zuletzt modifiziert: 09.10.2024, 12:52 - Dateigröße: 41.87 KiB


0001  <?php
0002  /**
0003  *
0004  * This file is part of the phpBB Forum Software package.
0005  *
0006  * @copyright (c) phpBB Limited <https://www.phpbb.com>
0007  * @license GNU General Public License, version 2 (GPL-2.0)
0008  *
0009  * For full copyright and license information, please see
0010  * the docs/CREDITS.txt file.
0011  *
0012  */
0013   
0014  /**
0015  * Direct-access guard: stop executing unless the IN_PHPBB constant has been defined */
0016  if (!defined('IN_PHPBB'))
0017  {
0018      exit;
0019  }
0020   
0021  /**
0022  * Some Unicode characters encoded in UTF-8 (the code point each byte sequence encodes is noted on its line)
0023  *
0024  * Preserved for compatibility
0025  */
0026  define('UTF8_REPLACEMENT', "\xEF\xBF\xBD"); // U+FFFD
0027  define('UTF8_MAX', "\xF4\x8F\xBF\xBF"); // U+10FFFF
0028  define('UTF8_FFFE', "\xEF\xBF\xBE"); // U+FFFE
0029  define('UTF8_FFFF', "\xEF\xBF\xBF"); // U+FFFF
0030  define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80"); // U+D800
0031  define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF"); // U+DFFF
0032  define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80"); // U+AC00
0033  define('UTF8_HANGUL_LAST', "\xED\x9E\xA3"); // U+D7A3
0034   
0035  define('UTF8_CJK_FIRST', "\xE4\xB8\x80"); // U+4E00
0036  define('UTF8_CJK_LAST', "\xE9\xBE\xBB"); // U+9FBB
0037  define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80"); // U+20000
0038  define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96"); // U+2A6D6
0039   
0040  // Unset the global lookup tables; the normalizer methods test them with isset() before including the matching data file
0041  unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
0042   
0043  // NFC_QC and NFKC_QC values, compared against entries of the quick-check tables
0044  define('UNICODE_QC_MAYBE', 0);
0045  define('UNICODE_QC_NO', 1);
0046   
0047  // ASCII bytes (control characters included), sorted by frequency; used as the strspn() mask that measures the leading ASCII-only run of a string
0048  define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");
0049   
0050  // Tail (continuation) bytes of multibyte UTF-8 chars, all within 0x80..0xBF; used as the strspn() mask that skips trailing bytes
0051  define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");
0052   
0053  // Constants used by the Hangul [de]composition algorithms
0054  define('UNICODE_HANGUL_SBASE', 0xAC00); // Same code point as UTF8_HANGUL_FIRST (U+AC00)
0055  define('UNICODE_HANGUL_LBASE', 0x1100);
0056  define('UNICODE_HANGUL_VBASE', 0x1161);
0057  define('UNICODE_HANGUL_TBASE', 0x11A7);
0058  define('UNICODE_HANGUL_SCOUNT', 11172); // 19 * 588, i.e. LCOUNT * NCOUNT
0059  define('UNICODE_HANGUL_LCOUNT', 19);
0060  define('UNICODE_HANGUL_VCOUNT', 21);
0061  define('UNICODE_HANGUL_TCOUNT', 28);
0062  define('UNICODE_HANGUL_NCOUNT', 588); // 21 * 28, i.e. VCOUNT * TCOUNT
0063  define('UNICODE_JAMO_L', 0);
0064  define('UNICODE_JAMO_V', 1);
0065  define('UNICODE_JAMO_T', 2);
0066   
0067  /**
0068  * Unicode normalization routines
0069  */
0070  class utf_normalizer
0071  {
0072      /**
0073      * Validate, cleanup and normalize a string
0074      *
0075      * The ultimate convenience function! Clean up invalid UTF-8 sequences,
0076      * and convert to Normal Form C, canonical composition.
0077      *
0078      * @param    string    &$str    The dirty string; it is cleaned up in place through the reference
0079      * @return    void            Nothing is returned, the cleaned-up result is written back into $str
0080      */
0081      static function cleanup(&$str)
0082      {
0083          // The string below is the list of all authorized characters, sorted by frequency in Latin text; $pos is the length of the leading run made only of them
0084          $pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
0085          $len = strlen($str);
0086   
0087          if ($pos == $len)
0088          {
0089              // ASCII strings with no special chars return immediately, $str is left untouched
0090              return;
0091          }
0092   
0093          // Include the NFC quick-check data file when its table is not in $GLOBALS yet (the file is expected to define it)
0094          if (!isset($GLOBALS['utf_nfc_qc']))
0095          {
0096              global $phpbb_root_path, $phpEx;
0097              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
0098          }
0099   
0100          if (!isset($GLOBALS['utf_canonical_decomp'])) // The canonical decomposition table is checked and included separately
0101          {
0102              global $phpbb_root_path, $phpEx;
0103              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
0104          }
0105   
0106          // Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
0107          // We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
0108          $str = strtr(
0109              $str,
0110              "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
0111              "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
0112          );
0113   
0114          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']); // Starts at $pos, the first byte outside the authorized list
0115      }
0116   
0117      /**
0118      * Validate and normalize a UTF string to NFC
0119      *
0120      * @param    string    &$str    Unchecked UTF string, replaced in place by the result of recompose()
0121      * @return    void            Nothing is returned, the result is written back into $str
0122      */
0123      static function nfc(&$str)
0124      {
0125          $pos = strspn($str, UTF8_ASCII_RANGE); // Length of the leading run of UTF8_ASCII_RANGE bytes
0126          $len = strlen($str);
0127   
0128          if ($pos == $len)
0129          {
0130              // ASCII strings return immediately, $str is left untouched
0131              return;
0132          }
0133   
0134          if (!isset($GLOBALS['utf_nfc_qc'])) // Quick-check table not in $GLOBALS yet: include the data file expected to define it
0135          {
0136              global $phpbb_root_path, $phpEx;
0137              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
0138          }
0139   
0140          if (!isset($GLOBALS['utf_canonical_decomp'])) // Same check for the canonical decomposition table
0141          {
0142              global $phpbb_root_path, $phpEx;
0143              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
0144          }
0145   
0146          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
0147      }
0148   
0149      /**
0150      * Validate and normalize a UTF string to NFKC
0151      *
0152      * @param    string    &$str    Unchecked UTF string, replaced in place by the result of recompose()
0153      * @return    void            Nothing is returned, the result is written back into $str
0154      */
0155      static function nfkc(&$str)
0156      {
0157          $pos = strspn($str, UTF8_ASCII_RANGE); // Length of the leading run of UTF8_ASCII_RANGE bytes
0158          $len = strlen($str);
0159   
0160          if ($pos == $len)
0161          {
0162              // ASCII strings return immediately, $str is left untouched
0163              return;
0164          }
0165   
0166          if (!isset($GLOBALS['utf_nfkc_qc'])) // Quick-check table not in $GLOBALS yet: include the data file expected to define it
0167          {
0168              global $phpbb_root_path, $phpEx;
0169              include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
0170          }
0171   
0172          if (!isset($GLOBALS['utf_compatibility_decomp'])) // Same check for the compatibility decomposition table
0173          {
0174              global $phpbb_root_path, $phpEx;
0175              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
0176          }
0177   
0178          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
0179      }
0180   
0181      /**
0182      * Validate and normalize a UTF string to NFD
0183      *
0184      * @param    string    &$str    Unchecked UTF string, replaced in place by the result of decompose()
0185      * @return    void            Nothing is returned, the result is written back into $str
0186      */
0187      static function nfd(&$str)
0188      {
0189          $pos = strspn($str, UTF8_ASCII_RANGE); // Length of the leading run of UTF8_ASCII_RANGE bytes
0190          $len = strlen($str);
0191   
0192          if ($pos == $len)
0193          {
0194              // ASCII strings return immediately, $str is left untouched
0195              return;
0196          }
0197   
0198          if (!isset($GLOBALS['utf_canonical_decomp'])) // Decomposition table not in $GLOBALS yet: include the data file expected to define it
0199          {
0200              global $phpbb_root_path, $phpEx;
0201              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
0202          }
0203   
0204          $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
0205      }
0206   
0207      /**
0208      * Validate and normalize a UTF string to NFKD
0209      *
0210      * @param    string    &$str    Unchecked UTF string, replaced in place by the result of decompose()
0211      * @return    void            Nothing is returned, the result is written back into $str
0212      */
0213      static function nfkd(&$str)
0214      {
0215          $pos = strspn($str, UTF8_ASCII_RANGE); // Length of the leading run of UTF8_ASCII_RANGE bytes
0216          $len = strlen($str);
0217   
0218          if ($pos == $len)
0219          {
0220              // ASCII strings return immediately, $str is left untouched
0221              return;
0222          }
0223   
0224          if (!isset($GLOBALS['utf_compatibility_decomp'])) // Decomposition table not in $GLOBALS yet: include the data file expected to define it
0225          {
0226              global $phpbb_root_path, $phpEx;
0227              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
0228          }
0229   
0230          $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
0231      }
0232   
0233   
0234      /**
0235      * Recompose a UTF string
0236      *
0237      * @param    string    $str            Unchecked UTF string
0238      * @param    integer    $pos            Position of the first UTF char (in bytes)
0239      * @param    integer    $len            Length of the string (in bytes)
0240      * @param    array    &$qc            Quick-check array, passed by reference but never modified
0241      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified
0242      * @return    string                    The string, validated and recomposed
0243      *
0244      * @access    private
0245      */
0246      static function recompose($str, $pos, $len, &$qc, &$decomp_map)
0247      {
0248          global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;
0249   
0250          // Load some commonly-used tables
0251          if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
0252          {
0253              global $phpbb_root_path, $phpEx;
0254              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
0255          }
0256   
0257          // Load the canonical composition table
0258          if (!isset($utf_canonical_comp))
0259          {
0260              global $phpbb_root_path, $phpEx;
0261              include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
0262          }
0263   
0264          // Buffer the last ASCII char before the UTF-8 stuff if applicable
0265          $tmp = '';
0266          $i = $tmp_pos = $last_cc = 0;
0267   
0268          $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
0269   
0270          // UTF char length array
0271          // This array is used to determine the length of a UTF character.
0272          // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
0273          // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
0274          // Otherwise, if $utf_len_mask[$c] is greater than 0, we have a the leading byte of a multibyte character
0275          // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
0276          $utf_len_mask = array(
0277              // Leading bytes masks
0278              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
0279              // Trailing bytes masks
0280              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
0281          );
0282   
0283          $extra_check = array(
0284              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
0285              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
0286              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
0287          );
0288   
0289          $utf_validation_mask = array(
0290              2    => "\xE0\xC0",
0291              3    => "\xF0\xC0\xC0",
0292              4    => "\xF8\xC0\xC0\xC0"
0293          );
0294   
0295          $utf_validation_check = array(
0296              2    => "\xC0\x80",
0297              3    => "\xE0\x80\x80",
0298              4    => "\xF0\x80\x80\x80"
0299          );
0300   
0301          // Main loop
0302          do
0303          {
0304              // STEP 0: Capture the current char and buffer it
0305              $c = $str[$pos];
0306              $c_mask = $c & "\xF0";
0307   
0308              if (isset($utf_len_mask[$c_mask]))
0309              {
0310                  // Byte at $pos is either a leading byte or a missplaced trailing byte
0311                  if ($utf_len = $utf_len_mask[$c_mask])
0312                  {
0313                      // Capture the char
0314                      $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);
0315   
0316                      // Let's find out if a thorough check is needed
0317                      if (isset($qc[$utf_char]))
0318                      {
0319                          // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
0320                      }
0321                      else if (isset($utf_combining_class[$utf_char]))
0322                      {
0323                          if ($utf_combining_class[$utf_char] < $last_cc)
0324                          {
0325                              // A combining character that is NOT canonically ordered
0326                          }
0327                          else
0328                          {
0329                              // A combining character that IS canonically ordered, skip to the next char
0330                              $last_cc = $utf_combining_class[$utf_char];
0331   
0332                              $pos += $utf_len;
0333                              continue;
0334                          }
0335                      }
0336                      else
0337                      {
0338                          // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
0339                          // It can be a singleton, a canonical composite, a replacement char or an even an ill-formed bunch of bytes. Let's find out
0340                          $last_cc = 0;
0341   
0342                          // Check that we have the correct number of trailing bytes
0343                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
0344                          {
0345                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
0346                              // has been encoded in a five- or six- byte sequence
0347                              if ($utf_char[0] >= "\xF8")
0348                              {
0349                                  if ($utf_char[0] < "\xFC")
0350                                  {
0351                                      $trailing_bytes = 4;
0352                                  }
0353                                  else if ($utf_char[0] > "\xFD")
0354                                  {
0355                                      $trailing_bytes = 0;
0356                                  }
0357                                  else
0358                                  {
0359                                      $trailing_bytes = 5;
0360                                  }
0361                              }
0362                              else
0363                              {
0364                                  $trailing_bytes = $utf_len - 1;
0365                              }
0366   
0367                              $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0368                              $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
0369                              $tmp_pos = $pos;
0370   
0371                              continue;
0372                          }
0373   
0374                          if (isset($extra_check[$c]))
0375                          {
0376                              switch ($c)
0377                              {
0378                                  // Note: 0xED is quite common in Korean
0379                                  case "\xED":
0380                                      if ($utf_char >= "\xED\xA0\x80")
0381                                      {
0382                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
0383                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0384                                          $pos += $utf_len;
0385                                          $tmp_pos = $pos;
0386                                          continue 2;
0387                                      }
0388                                  break;
0389   
0390                                  // Note: 0xEF is quite common in Japanese
0391                                  case "\xEF":
0392                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
0393                                      {
0394                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
0395                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0396                                          $pos += $utf_len;
0397                                          $tmp_pos = $pos;
0398                                          continue 2;
0399                                      }
0400                                  break;
0401   
0402                                  case "\xC0":
0403                                  case "\xC1":
0404                                      if ($utf_char <= "\xC1\xBF")
0405                                      {
0406                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
0407                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0408                                          $pos += $utf_len;
0409                                          $tmp_pos = $pos;
0410                                          continue 2;
0411                                      }
0412                                  break;
0413   
0414                                  case "\xE0":
0415                                      if ($utf_char <= "\xE0\x9F\xBF")
0416                                      {
0417                                          // Unicode char U+0000..U+07FF encoded in 3 bytes
0418                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0419                                          $pos += $utf_len;
0420                                          $tmp_pos = $pos;
0421                                          continue 2;
0422                                      }
0423                                  break;
0424   
0425                                  case "\xF0":
0426                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")
0427                                      {
0428                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes
0429                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0430                                          $pos += $utf_len;
0431                                          $tmp_pos = $pos;
0432                                          continue 2;
0433                                      }
0434                                  break;
0435   
0436                                  default:
0437                                      // Five- and six- byte sequences do not need being checked for here anymore
0438                                      if ($utf_char > UTF8_MAX)
0439                                      {
0440                                          // Out of the Unicode range
0441                                          if ($utf_char[0] < "\xF8")
0442                                          {
0443                                              $trailing_bytes = 3;
0444                                          }
0445                                          else if ($utf_char[0] < "\xFC")
0446                                          {
0447                                              $trailing_bytes = 4;
0448                                          }
0449                                          else if ($utf_char[0] > "\xFD")
0450                                          {
0451                                              $trailing_bytes = 0;
0452                                          }
0453                                          else
0454                                          {
0455                                              $trailing_bytes = 5;
0456                                          }
0457   
0458                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0459                                          $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
0460                                          $tmp_pos = $pos;
0461                                          continue 2;
0462                                      }
0463                                  break;
0464                              }
0465                          }
0466   
0467                          // The char is a valid starter, move the cursor and go on
0468                          $pos += $utf_len;
0469                          continue;
0470                      }
0471                  }
0472                  else
0473                  {
0474                      // A trailing byte came out of nowhere, we will advance the cursor and treat the this byte and all following trailing bytes as if
0475                      // each of them was a Unicode replacement char
0476                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
0477                      $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
0478   
0479                      $pos += $spn;
0480                      $tmp_pos = $pos;
0481                      continue;
0482                  }
0483   
0484                  // STEP 1: Decompose current char
0485   
0486                  // We have found a character that is either:
0487                  //  - in the NFC_QC/NFKC_QC list
0488                  //  - a non-starter char that is not canonically ordered
0489                  //
0490                  // We are going to capture the shortest UTF sequence that satisfies these two conditions:
0491                  //
0492                  //  1 - If the sequence does not start at the begginning of the string, it must begin with a starter,
0493                  // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
0494                  //
0495                  //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
0496                  // immediately followed by a starter that is not on the QC list
0497                  //
0498                  $utf_seq = array();
0499                  $last_cc = 0;
0500                  $lpos = $pos;
0501                  $pos += $utf_len;
0502   
0503                  if (isset($decomp_map[$utf_char]))
0504                  {
0505                      $_pos = 0;
0506                      $_len = strlen($decomp_map[$utf_char]);
0507   
0508                      do
0509                      {
0510                          $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
0511   
0512                          if (isset($_utf_len))
0513                          {
0514                              $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0515                              $_pos += $_utf_len;
0516                          }
0517                          else
0518                          {
0519                              $utf_seq[] = $decomp_map[$utf_char][$_pos];
0520                              ++$_pos;
0521                          }
0522                      }
0523                      while ($_pos < $_len);
0524                  }
0525                  else
0526                  {
0527                      // The char is not decomposable
0528                      $utf_seq = array($utf_char);
0529                  }
0530   
0531                  // STEP 2: Capture the starter
0532   
0533                  // Check out the combining class of the first character of the UTF sequence
0534                  $k = 0;
0535                  if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
0536                  {
0537                      // Not a starter, inspect previous characters
0538                      // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.
0539                      // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
0540                      // although it is slower than this method.
0541                      //
0542                      // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
0543                      // at offset $i) and process them in backward mode until we find a starter.
0544                      //
0545                      // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
0546                      // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering
0547                      $starter_found = 0;
0548                      $j_min = max(1, $i - 7);
0549   
0550                      for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
0551                      {
0552                          $utf_char = $buffer[$j & 7];
0553                          $lpos -= strlen($utf_char);
0554   
0555                          if (isset($decomp_map[$utf_char]))
0556                          {
0557                              // The char is a composite, decompose for storage
0558                              $decomp_seq = array();
0559                              $_pos = 0;
0560                              $_len = strlen($decomp_map[$utf_char]);
0561   
0562                              do
0563                              {
0564                                  $c = $decomp_map[$utf_char][$_pos];
0565                                  $_utf_len =& $utf_len_mask[$c & "\xF0"];
0566   
0567                                  if (isset($_utf_len))
0568                                  {
0569                                      $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0570                                      $_pos += $_utf_len;
0571                                  }
0572                                  else
0573                                  {
0574                                      $decomp_seq[] = $c;
0575                                      ++$_pos;
0576                                  }
0577                              }
0578                              while ($_pos < $_len);
0579   
0580                              // Prepend the UTF sequence with our decomposed sequence
0581                              if (isset($decomp_seq[1]))
0582                              {
0583                                  // The char expanded into several chars
0584                                  $decomp_cnt = sizeof($decomp_seq);
0585   
0586                                  foreach ($decomp_seq as $decomp_i => $decomp_char)
0587                                  {
0588                                      $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
0589                                  }
0590                                  $k -= $decomp_cnt;
0591                              }
0592                              else
0593                              {
0594                                  // Decomposed to a single char, easier to prepend
0595                                  $utf_seq[--$k] = $decomp_seq[0];
0596                              }
0597                          }
0598                          else
0599                          {
0600                              $utf_seq[--$k] = $utf_char;
0601                          }
0602   
0603                          if (!isset($utf_combining_class[$utf_seq[$k]]))
0604                          {
0605                              // We have found our starter
0606                              $starter_found = 1;
0607                              break;
0608                          }
0609                      }
0610   
0611                      if (!$starter_found && $lpos > $tmp_pos)
0612                      {
0613                          // The starter was not found in the buffer, let's rewind some more
0614                          do
0615                          {
0616                              // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_en > 0 then it's a leading byte, otherwise it's a trailing byte.
0617                              $c = $str[--$lpos];
0618                              $c_mask = $c & "\xF0";
0619   
0620                              if (isset($utf_len_mask[$c_mask]))
0621                              {
0622                                  // UTF byte
0623                                  if ($utf_len = $utf_len_mask[$c_mask])
0624                                  {
0625                                      // UTF *leading* byte
0626                                      $utf_char = substr($str, $lpos, $utf_len);
0627   
0628                                      if (isset($decomp_map[$utf_char]))
0629                                      {
0630                                          // Decompose the character
0631                                          $decomp_seq = array();
0632                                          $_pos = 0;
0633                                          $_len = strlen($decomp_map[$utf_char]);
0634   
0635                                          do
0636                                          {
0637                                              $c = $decomp_map[$utf_char][$_pos];
0638                                              $_utf_len =& $utf_len_mask[$c & "\xF0"];
0639   
0640                                              if (isset($_utf_len))
0641                                              {
0642                                                  $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0643                                                  $_pos += $_utf_len;
0644                                              }
0645                                              else
0646                                              {
0647                                                  $decomp_seq[] = $c;
0648                                                  ++$_pos;
0649                                              }
0650                                          }
0651                                          while ($_pos < $_len);
0652   
0653                                          // Prepend the UTF sequence with our decomposed sequence
0654                                          if (isset($decomp_seq[1]))
0655                                          {
0656                                              // The char expanded into several chars
0657                                              $decomp_cnt = sizeof($decomp_seq);
0658                                              foreach ($decomp_seq as $decomp_i => $utf_char)
0659                                              {
0660                                                  $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
0661                                              }
0662                                              $k -= $decomp_cnt;
0663                                          }
0664                                          else
0665                                          {
0666                                              // Decomposed to a single char, easier to prepend
0667                                              $utf_seq[--$k] = $decomp_seq[0];
0668                                          }
0669                                      }
0670                                      else
0671                                      {
0672                                          $utf_seq[--$k] = $utf_char;
0673                                      }
0674                                  }
0675                              }
0676                              else
0677                              {
0678                                  // ASCII char
0679                                  $utf_seq[--$k] = $c;
0680                              }
0681                          }
0682                          while ($lpos > $tmp_pos);
0683                      }
0684                  }
0685   
0686                  // STEP 3: Capture following combining modifiers
0687   
0688                  while ($pos < $len)
0689                  {
0690                      $c_mask = $str[$pos] & "\xF0";
0691   
0692                      if (isset($utf_len_mask[$c_mask]))
0693                      {
0694                          if ($utf_len = $utf_len_mask[$c_mask])
0695                          {
0696                              $utf_char = substr($str, $pos, $utf_len);
0697                          }
0698                          else
0699                          {
0700                              // A trailing byte came out of nowhere
0701                              // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
0702                              // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
0703                              break;
0704                          }
0705   
0706                          if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
0707                          {
0708                              // Combining character, add it to the sequence and move the cursor
0709                              if (isset($decomp_map[$utf_char]))
0710                              {
0711                                  // Decompose the character
0712                                  $_pos = 0;
0713                                  $_len = strlen($decomp_map[$utf_char]);
0714   
0715                                  do
0716                                  {
0717                                      $c = $decomp_map[$utf_char][$_pos];
0718                                      $_utf_len =& $utf_len_mask[$c & "\xF0"];
0719   
0720                                      if (isset($_utf_len))
0721                                      {
0722                                          $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0723                                          $_pos += $_utf_len;
0724                                      }
0725                                      else
0726                                      {
0727                                          $utf_seq[] = $c;
0728                                          ++$_pos;
0729                                      }
0730                                  }
0731                                  while ($_pos < $_len);
0732                              }
0733                              else
0734                              {
0735                                  $utf_seq[] = $utf_char;
0736                              }
0737   
0738                              $pos += $utf_len;
0739                          }
0740                          else
0741                          {
0742                              // Combining class 0 and no QC, break out of the loop
0743                              // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
0744                              break;
0745                          }
0746                      }
0747                      else
0748                      {
0749                          // ASCII chars are starters
0750                          break;
0751                      }
0752                  }
0753   
0754                  // STEP 4: Sort and combine
0755   
0756                  // Here we sort...
0757                  $k_max = $k + sizeof($utf_seq);
0758   
0759                  if (!$k && $k_max == 1)
0760                  {
0761                      // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
0762                          // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
0763  //                        if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
0764  //                        {
0765                          $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
0766                          $tmp_pos = $pos;
0767  //                        }
0768   
0769                      continue;
0770                  }
0771   
0772                  // ...there we combine
0773                  if (isset($utf_combining_class[$utf_seq[$k]]))
0774                  {
0775                      $starter = $nf_seq = '';
0776                  }
0777                  else
0778                  {
0779                      $starter = $utf_seq[$k++];
0780                      $nf_seq = '';
0781                  }
0782                  $utf_sort = array();
0783   
0784                  // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
0785                  // at the end of the string without altering it
0786                  $utf_seq[] = '';
0787   
0788                  do
0789                  {
0790                      $utf_char = $utf_seq[$k++];
0791   
0792                      if (isset($utf_combining_class[$utf_char]))
0793                      {
0794                          $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
0795                      }
0796                      else
0797                      {
0798                          if (empty($utf_sort))
0799                          {
0800                              // No combining characters... check for a composite of the two starters
0801                              if (isset($utf_canonical_comp[$starter . $utf_char]))
0802                              {
0803                                  // Good ol' composite character
0804                                  $starter = $utf_canonical_comp[$starter . $utf_char];
0805                              }
0806                              else if (isset($utf_jamo_type[$utf_char]))
0807                              {
0808                                  // Current char is a composable jamo
0809                                  if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
0810                                  {
0811                                      // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
0812                                      if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
0813                                      {
0814                                          // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
0815                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
0816                                          ++$k;
0817                                      }
0818                                      else
0819                                      {
0820                                          // L+V jamos, combine to a LV Hangul syllable
0821                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
0822                                      }
0823   
0824                                      $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
0825                                  }
0826                                  else
0827                                  {
0828                                      // Non-composable jamo, just add it to the sequence
0829                                      $nf_seq .= $starter;
0830                                      $starter = $utf_char;
0831                                  }
0832                              }
0833                              else
0834                              {
0835                                  // No composite, just add the first starter to the sequence then continue with the other one
0836                                  $nf_seq .= $starter;
0837                                  $starter = $utf_char;
0838                              }
0839                          }
0840                          else
0841                          {
0842                              ksort($utf_sort);
0843   
0844                              // For each class of combining characters
0845                              foreach ($utf_sort as $cc => $utf_chars)
0846                              {
0847                                  $j = 0;
0848   
0849                                  do
0850                                  {
0851                                      // Look for a composite
0852                                      if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
0853                                      {
0854                                          // Found a composite, replace the starter
0855                                          $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
0856                                          unset($utf_sort[$cc][$j]);
0857                                      }
0858                                      else
0859                                      {
0860                                          // No composite, all following characters in that class are blocked
0861                                          break;
0862                                      }
0863                                  }
0864                                  while (isset($utf_sort[$cc][++$j]));
0865                              }
0866   
0867                              // Add the starter to the normalized sequence, followed by non-starters in canonical order
0868                              $nf_seq .= $starter;
0869   
0870                              foreach ($utf_sort as $utf_chars)
0871                              {
0872                                  if (!empty($utf_chars))
0873                                  {
0874                                      $nf_seq .= implode('', $utf_chars);
0875                                  }
0876                              }
0877   
0878                              // Reset the array and go on
0879                              $utf_sort = array();
0880                              $starter = $utf_char;
0881                          }
0882                      }
0883                  }
0884                  while ($k <= $k_max);
0885   
0886                  $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
0887                  $tmp_pos = $pos;
0888              }
0889              else
0890              {
0891                  // Only an ASCII char can make the program get here
0892                  //
0893                  // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
0894                  //
0895                  // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
0896                  // multi-byte text (where the only ASCII chars are spaces and punctuation)
0897                  if (++$pos != $len)
0898                  {
0899                      if ($str[$pos] < "\x80")
0900                      {
0901                          $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
0902                          $buffer[++$i & 7] = $str[$pos - 1];
0903                      }
0904                      else
0905                      {
0906                          $buffer[++$i & 7] = $c;
0907                      }
0908                  }
0909              }
0910          }
0911          while ($pos < $len);
0912   
0913          // Now is time to return the string
0914          if ($tmp_pos)
0915          {
0916              // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
0917              if ($tmp_pos == $len)
0918              {
0919                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
0920                  return $tmp;
0921              }
0922              else
0923              {
0924                  // The rightmost chunk of $str has not been appended to $tmp yet
0925                  return $tmp . substr($str, $tmp_pos);
0926              }
0927          }
0928   
0929          // The string was already in normal form
0930          return $str;
0931      }
0932   
0933      /**
0934      * Decompose a UTF string
0935      *
0936      * @param    string    $str            UTF string
0937      * @param    integer    $pos            Position of the first UTF char (in bytes)
0938      * @param    integer    $len            Length of the string (in bytes)
0939      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified
0940      * @return    string                    The string, decomposed and sorted canonically
0941      *
0942      * @access    private
0943      */
0944      static function decompose($str, $pos, $len, &$decomp_map)
0945      {
0946          global $utf_combining_class;
0947   
0948          // Load some commonly-used tables
0949          if (!isset($utf_combining_class))
0950          {
0951              global $phpbb_root_path, $phpEx;
0952              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
0953          }
0954   
0955          // UTF char length array
0956          $utf_len_mask = array(
0957              // Leading bytes masks
0958              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
0959              // Trailing bytes masks
0960              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
0961          );
0962   
0963          // Some extra checks are triggered on the first byte of a UTF sequence
0964          $extra_check = array(
0965              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
0966              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
0967              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
0968          );
0969   
0970          // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
0971          //   - 2-byte: 110? ???? 10?? ????
0972          //   - 3-byte: 1110 ???? 10?? ???? 10?? ????
0973          //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
0974          // Note that 5- and 6- byte sequences are automatically discarded
0975          $utf_validation_mask = array(
0976              2    => "\xE0\xC0",
0977              3    => "\xF0\xC0\xC0",
0978              4    => "\xF8\xC0\xC0\xC0"
0979          );
0980   
0981          $utf_validation_check = array(
0982              2    => "\xC0\x80",
0983              3    => "\xE0\x80\x80",
0984              4    => "\xF0\x80\x80\x80"
0985          );
0986   
0987          $tmp = '';
0988          $starter_pos = $pos;
0989          $tmp_pos = $last_cc = $sort = $dump = 0;
0990          $utf_sort = array();
0991   
0992          // Main loop
0993          do
0994          {
0995              // STEP 0: Capture the current char
0996   
0997              $cur_mask = $str[$pos] & "\xF0";
0998              if (isset($utf_len_mask[$cur_mask]))
0999              {
1000                  if ($utf_len = $utf_len_mask[$cur_mask])
1001                  {
1002                      // Multibyte char
1003                      $utf_char = substr($str, $pos, $utf_len);
1004                      $pos += $utf_len;
1005                  }
1006                  else
1007                  {
1008                      // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
1009                      // replacement char and we will advance the cursor
1010                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
1011   
1012                      if ($dump)
1013                      {
1014                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1015   
1016                          // Dump combiners
1017                          if (!empty($utf_sort))
1018                          {
1019                              if ($sort)
1020                              {
1021                                  ksort($utf_sort);
1022                              }
1023   
1024                              foreach ($utf_sort as $utf_chars)
1025                              {
1026                                  $tmp .= implode('', $utf_chars);
1027                              }
1028                          }
1029   
1030                          $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);
1031                          $dump = $sort = 0;
1032                      }
1033                      else
1034                      {
1035                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
1036                      }
1037   
1038                      $pos += $spn;
1039                      $tmp_pos = $starter_pos = $pos;
1040   
1041                      $utf_sort = array();
1042                      $last_cc = 0;
1043   
1044                      continue;
1045                  }
1046   
1047                  // STEP 1: Decide what to do with current char
1048   
1049                  // Now, in that order:
1050                  //  - check if that character is decomposable
1051                  //  - check if that character is a non-starter
1052                  //  - check if that character requires extra checks to be performed
1053                  if (isset($decomp_map[$utf_char]))
1054                  {
1055                      // Decompose the char
1056                      $_pos = 0;
1057                      $_len = strlen($decomp_map[$utf_char]);
1058   
1059                      do
1060                      {
1061                          $c = $decomp_map[$utf_char][$_pos];
1062                          $_utf_len =& $utf_len_mask[$c & "\xF0"];
1063   
1064                          if (isset($_utf_len))
1065                          {
1066                              $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);
1067                              $_pos += $_utf_len;
1068   
1069                              if (isset($utf_combining_class[$_utf_char]))
1070                              {
1071                                  // The character decomposed to a non-starter, buffer it for sorting
1072                                  $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;
1073   
1074                                  if ($utf_combining_class[$_utf_char] < $last_cc)
1075                                  {
1076                                      // Not canonically ordered, will require sorting
1077                                      $sort = $dump = 1;
1078                                  }
1079                                  else
1080                                  {
1081                                      $dump = 1;
1082                                      $last_cc = $utf_combining_class[$_utf_char];
1083                                  }
1084                              }
1085                              else
1086                              {
1087                                  // This character decomposition contains a starter, dump the buffer and continue
1088                                  if ($dump)
1089                                  {
1090                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1091   
1092                                      // Dump combiners
1093                                      if (!empty($utf_sort))
1094                                      {
1095                                          if ($sort)
1096                                          {
1097                                              ksort($utf_sort);
1098                                          }
1099   
1100                                          foreach ($utf_sort as $utf_chars)
1101                                          {
1102                                              $tmp .= implode('', $utf_chars);
1103                                          }
1104                                      }
1105   
1106                                      $tmp .= $_utf_char;
1107                                      $dump = $sort = 0;
1108                                  }
1109                                  else
1110                                  {
1111                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;
1112                                  }
1113   
1114                                  $tmp_pos = $starter_pos = $pos;
1115                                  $utf_sort = array();
1116                                  $last_cc = 0;
1117                              }
1118                          }
1119                          else
1120                          {
1121                              // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
1122                              ++$_pos;
1123   
1124                              if ($dump)
1125                              {
1126                                  $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1127   
1128                                  // Dump combiners
1129                                  if (!empty($utf_sort))
1130                                  {
1131                                      if ($sort)
1132                                      {
1133                                          ksort($utf_sort);
1134                                      }
1135   
1136                                      foreach ($utf_sort as $utf_chars)
1137                                      {
1138                                          $tmp .= implode('', $utf_chars);
1139                                      }
1140                                  }
1141   
1142                                  $tmp .= $c;
1143                                  $dump = $sort = 0;
1144                              }
1145                              else
1146                              {
1147                                  $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;
1148                              }
1149   
1150                              $tmp_pos = $starter_pos = $pos;
1151                              $utf_sort = array();
1152                              $last_cc = 0;
1153                          }
1154                      }
1155                      while ($_pos < $_len);
1156                  }
1157                  else if (isset($utf_combining_class[$utf_char]))
1158                  {
1159                      // Combining character
1160                      if ($utf_combining_class[$utf_char] < $last_cc)
1161                      {
1162                          // Not in canonical order
1163                          $sort = $dump = 1;
1164                      }
1165                      else
1166                      {
1167                          $last_cc = $utf_combining_class[$utf_char];
1168                      }
1169   
1170                      $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
1171                  }
1172                  else
1173                  {
1174                      // Non-decomposable starter, check out if it's a Hangul syllable
1175                      if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)
1176                      {
1177                          // Nope, regular UTF char, check that we have the correct number of trailing bytes
1178                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
1179                          {
1180                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
1181                              // has been encoded in a five- or six- byte sequence.
1182                              // Move the cursor back to its original position then advance it to the position it should really be at
1183                              $pos -= $utf_len;
1184                              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1185   
1186                              if (!empty($utf_sort))
1187                              {
1188                                  ksort($utf_sort);
1189   
1190                                  foreach ($utf_sort as $utf_chars)
1191                                  {
1192                                      $tmp .= implode('', $utf_chars);
1193                                  }
1194                                  $utf_sort = array();
1195                              }
1196   
1197                              // Add a replacement char then another replacement char for every trailing byte.
1198                              //
1199                              // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
1200                              $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);
1201                              $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);
1202   
1203                              $dump = $sort = 0;
1204   
1205                              $pos += $spn;
1206                              $tmp_pos = $pos;
1207                              continue;
1208                          }
1209   
1210                          if (isset($extra_check[$utf_char[0]]))
1211                          {
1212                              switch ($utf_char[0])
1213                              {
1214                                  // Note: 0xED is quite common in Korean
1215                                  case "\xED":
1216                                      if ($utf_char >= "\xED\xA0\x80")
1217                                      {
1218                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
1219                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1220   
1221                                          if (!empty($utf_sort))
1222                                          {
1223                                              ksort($utf_sort);
1224   
1225                                              foreach ($utf_sort as $utf_chars)
1226                                              {
1227                                                  $tmp .= implode('', $utf_chars);
1228                                              }
1229                                              $utf_sort = array();
1230                                          }
1231   
1232                                          $tmp .= UTF8_REPLACEMENT;
1233                                          $dump = $sort = 0;
1234   
1235                                          $tmp_pos = $starter_pos = $pos;
1236                                          continue 2;
1237                                      }
1238                                  break;
1239   
1240                                  // Note: 0xEF is quite common in Japanese
1241                                  case "\xEF":
1242                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
1243                                      {
1244                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
1245                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1246   
1247                                          if (!empty($utf_sort))
1248                                          {
1249                                              ksort($utf_sort);
1250   
1251                                              foreach ($utf_sort as $utf_chars)
1252                                              {
1253                                                  $tmp .= implode('', $utf_chars);
1254                                              }
1255                                              $utf_sort = array();
1256                                          }
1257   
1258                                          $tmp .= UTF8_REPLACEMENT;
1259                                          $dump = $sort = 0;
1260   
1261                                          $tmp_pos = $starter_pos = $pos;
1262                                          continue 2;
1263                                      }
1264                                  break;
1265   
1266                                  case "\xC0":
1267                                  case "\xC1":
1268                                      if ($utf_char <= "\xC1\xBF")
1269                                      {
1270                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
1271                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1272   
1273                                          if (!empty($utf_sort))
1274                                          {
1275                                              ksort($utf_sort);
1276   
1277                                              foreach ($utf_sort as $utf_chars)
1278                                              {
1279                                                  $tmp .= implode('', $utf_chars);
1280                                              }
1281                                              $utf_sort = array();
1282                                          }
1283   
1284                                          $tmp .= UTF8_REPLACEMENT;
1285                                          $dump = $sort = 0;
1286   
1287                                          $tmp_pos = $starter_pos = $pos;
1288                                          continue 2;
1289                                      }
1290                                  break;
1291   
1292                                  case "\xE0":
1293                                      if ($utf_char <= "\xE0\x9F\xBF")
1294                                      {
1295                                          // Unicode char U+0000..U+07FF encoded in 3 bytes
1296                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1297   
1298                                          if (!empty($utf_sort))
1299                                          {
1300                                              ksort($utf_sort);
1301   
1302                                              foreach ($utf_sort as $utf_chars)
1303                                              {
1304                                                  $tmp .= implode('', $utf_chars);
1305                                              }
1306                                              $utf_sort = array();
1307                                          }
1308   
1309                                          $tmp .= UTF8_REPLACEMENT;
1310                                          $dump = $sort = 0;
1311   
1312                                          $tmp_pos = $starter_pos = $pos;
1313                                          continue 2;
1314                                      }
1315                                  break;
1316   
1317                                  case "\xF0":
1318                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")
1319                                      {
1320                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes
1321                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1322   
1323                                          if (!empty($utf_sort))
1324                                          {
1325                                              ksort($utf_sort);
1326   
1327                                              foreach ($utf_sort as $utf_chars)
1328                                              {
1329                                                  $tmp .= implode('', $utf_chars);
1330                                              }
1331                                              $utf_sort = array();
1332                                          }
1333   
1334                                          $tmp .= UTF8_REPLACEMENT;
1335                                          $dump = $sort = 0;
1336   
1337                                          $tmp_pos = $starter_pos = $pos;
1338                                          continue 2;
1339                                      }
1340                                  break;
1341   
1342                                  default:
1343                                      if ($utf_char > UTF8_MAX)
1344                                      {
1345                                          // Out of the Unicode range
1346                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1347   
1348                                          if (!empty($utf_sort))
1349                                          {
1350                                              ksort($utf_sort);
1351   
1352                                              foreach ($utf_sort as $utf_chars)
1353                                              {
1354                                                  $tmp .= implode('', $utf_chars);
1355                                              }
1356                                              $utf_sort = array();
1357                                          }
1358   
1359                                          $tmp .= UTF8_REPLACEMENT;
1360                                          $dump = $sort = 0;
1361   
1362                                          $tmp_pos = $starter_pos = $pos;
1363                                          continue 2;
1364                                      }
1365                                  break;
1366                              }
1367                          }
1368                      }
1369                      else
1370                      {
1371                          // Hangul syllable
1372                          $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;
1373   
1374                          // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
1375                          //
1376                          // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
1377                          if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
1378                          {
1379                              if ($t_index < 25)
1380                              {
1381                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
1382                                  $utf_char[8] = chr(0xA7 + $t_index);
1383                              }
1384                              else
1385                              {
1386                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
1387                                  $utf_char[8] = chr(0x67 + $t_index);
1388                              }
1389                          }
1390                          else
1391                          {
1392                              $utf_char = "\xE1\x84\x00\xE1\x85\x00";
1393                          }
1394   
1395                          $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
1396                          $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
1397   
1398                          // Just like other decompositions, the resulting Jamos must be dumped to the tmp string
1399                          $dump = 1;
1400                      }
1401   
1402                      // Do we need to dump stuff to the tmp string?
1403                      if ($dump)
1404                      {
1405                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1406   
1407                          // Dump combiners
1408                          if (!empty($utf_sort))
1409                          {
1410                              if ($sort)
1411                              {
1412                                  ksort($utf_sort);
1413                              }
1414   
1415                              foreach ($utf_sort as $utf_chars)
1416                              {
1417                                  $tmp .= implode('', $utf_chars);
1418                              }
1419                          }
1420   
1421                          $tmp .= $utf_char;
1422                          $dump = $sort = 0;
1423                          $tmp_pos = $pos;
1424                      }
1425   
1426                      $last_cc = 0;
1427                      $utf_sort = array();
1428                      $starter_pos = $pos;
1429                  }
1430              }
1431              else
1432              {
1433                  // ASCII char, which happens to be a starter (like any other ASCII char)
1434                  if ($dump)
1435                  {
1436                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1437   
1438                      // Dump combiners
1439                      if (!empty($utf_sort))
1440                      {
1441                          if ($sort)
1442                          {
1443                              ksort($utf_sort);
1444                          }
1445   
1446                          foreach ($utf_sort as $utf_chars)
1447                          {
1448                              $tmp .= implode('', $utf_chars);
1449                          }
1450                      }
1451   
1452                      $tmp .= $str[$pos];
1453                      $dump = $sort = 0;
1454                      $tmp_pos = ++$pos;
1455   
1456                      $pos += strspn($str, UTF8_ASCII_RANGE, $pos);
1457                  }
1458                  else
1459                  {
1460                      $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
1461                  }
1462   
1463                  $last_cc = 0;
1464                  $utf_sort = array();
1465                  $starter_pos = $pos;
1466              }
1467          }
1468          while ($pos < $len);
1469   
1470          // Now it is time to return the string
1471          if ($dump)
1472          {
1473              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1474   
1475              // Dump combiners
1476              if (!empty($utf_sort))
1477              {
1478                  if ($sort)
1479                  {
1480                      ksort($utf_sort);
1481                  }
1482   
1483                  foreach ($utf_sort as $utf_chars)
1484                  {
1485                      $tmp .= implode('', $utf_chars);
1486                  }
1487              }
1488   
1489              return $tmp;
1490          }
1491          else if ($tmp_pos)
1492          {
1493              // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
1494              if ($tmp_pos == $len)
1495              {
1496                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
1497                  return $tmp;
1498              }
1499              else
1500              {
1501                  // The rightmost chunk of $str has not been appended to $tmp yet
1502                  return $tmp . substr($str, $tmp_pos);
1503              }
1504          }
1505   
1506          // The string was already in normal form
1507          return $str;
1508      }
1509  }
1510