Verzeichnisstruktur phpBB-3.0.0


Veröffentlicht
12.12.2007

So funktioniert es


Auf das letzte Element klicken. Dies geht jeweils einen Schritt zurück

Auf das Icon klicken, dies öffnet das Verzeichnis. Nochmal klicken schließt das Verzeichnis.
Auf den Verzeichnisnamen klicken, dies zeigt nur das Verzeichnis mit Inhalt an

(Beispiel Datei-Icons)

Auf das Icon klicken, um den Quellcode anzuzeigen

utf_normalizer.php

Zuletzt modifiziert: 09.10.2024, 12:51 - Dateigröße: 41.73 KiB


0001  <?php
0002  /**
0003  *
0004  * @package utf
0005  * @version $Id$
0006  * @copyright (c) 2005 phpBB Group
0007  * @license http://opensource.org/licenses/gpl-license.php GNU Public License
0008  *
0009  */
0010   
0011  /**
0012  */
// Stop right away when the IN_PHPBB marker constant has not been defined
if (defined('IN_PHPBB') == false)
{
    exit;
}
0017   
// Drop any global of the same name as the lookup tables, so that the isset()
// checks in the normalizer only ever see tables set after this point
unset(
    $GLOBALS['utf_jamo_index'],
    $GLOBALS['utf_jamo_type'],
    $GLOBALS['utf_nfc_qc'],
    $GLOBALS['utf_combining_class'],
    $GLOBALS['utf_canonical_comp'],
    $GLOBALS['utf_canonical_decomp'],
    $GLOBALS['utf_nfkc_qc'],
    $GLOBALS['utf_compatibility_decomp']
);

// Values stored in the NFC_QC and NFKC_QC quick-check tables
define('UNICODE_QC_MAYBE', 0);
define('UNICODE_QC_NO', 1);

/**
* UTF-8 byte sequences of a few notable Unicode characters
*
* Kept around for compatibility
*/
define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");
define('UTF8_MAX', "\xF4\x8F\xBF\xBF");
define('UTF8_FFFE', "\xEF\xBF\xBE");
define('UTF8_FFFF', "\xEF\xBF\xBF");

// First and last sequence of the surrogate range
define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");
define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");

// First and last sequence of the Hangul range
define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");
define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");

// First and last sequence of the CJK ranges
define('UTF8_CJK_FIRST', "\xE4\xB8\x80");
define('UTF8_CJK_LAST', "\xE9\xBE\xBB");
define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");
define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");

// Every ASCII byte that can show up in a UTF-8 string, most frequent first
define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

// Every byte that can be used as a tail (continuation) byte of a UTF-8 char
define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

// Hangul [de]composition: number of L, V and T jamo, then the derived counts
define('UNICODE_HANGUL_LCOUNT', 19);
define('UNICODE_HANGUL_VCOUNT', 21);
define('UNICODE_HANGUL_TCOUNT', 28);
define('UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT); // 588
define('UNICODE_HANGUL_SCOUNT', UNICODE_HANGUL_LCOUNT * UNICODE_HANGUL_NCOUNT); // 11172

// Hangul [de]composition: base code points
define('UNICODE_HANGUL_SBASE', 0xAC00);
define('UNICODE_HANGUL_LBASE', 0x1100);
define('UNICODE_HANGUL_VBASE', 0x1161);
define('UNICODE_HANGUL_TBASE', 0x11A7);

// Hangul [de]composition: jamo types
define('UNICODE_JAMO_L', 0);
define('UNICODE_JAMO_V', 1);
define('UNICODE_JAMO_T', 2);
0063   
0064  /**
0065  * Unicode normalization routines
0066  *
0067  * @package utf
0068  */
0069  class utf_normalizer
0070  {
0071      /**
0072      * Validate, cleanup and normalize a string
0073      *
0074      * The ultimate convenience function! Clean up invalid UTF-8 sequences,
0075      * and convert to Normal Form C, canonical composition.
0076      *
0077      * @param    string    &$str    The dirty string
0078      * @return    string            The same string, all shiny and cleaned-up
0079      */
0080      function cleanup(&$str)
0081      {
0082          // The string below is the list of all autorized characters, sorted by frequency in latin text
0083          $pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
0084          $len = strlen($str);
0085   
0086          if ($pos == $len)
0087          {
0088              // ASCII strings with no special chars return immediately
0089              return;
0090          }
0091   
0092          // Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
0093          if (!isset($GLOBALS['utf_nfc_qc']))
0094          {
0095              global $phpbb_root_path, $phpEx;
0096              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
0097          }
0098   
0099          if (!isset($GLOBALS['utf_canonical_decomp']))
0100          {
0101              global $phpbb_root_path, $phpEx;
0102              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
0103          }
0104   
0105          // Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
0106          // We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
0107          $str = strtr(
0108              $str,
0109              "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
0110              "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
0111          );
0112   
0113          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
0114      }
0115   
0116      /**
0117      * Validate and normalize a UTF string to NFC
0118      *
0119      * @param    string    &$str    Unchecked UTF string
0120      * @return    string            The string, validated and in normal form
0121      */
0122      function nfc(&$str)
0123      {
0124          $pos = strspn($str, UTF8_ASCII_RANGE);
0125          $len = strlen($str);
0126   
0127          if ($pos == $len)
0128          {
0129              // ASCII strings return immediately
0130              return;
0131          }
0132   
0133          if (!isset($GLOBALS['utf_nfc_qc']))
0134          {
0135              global $phpbb_root_path, $phpEx;
0136              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
0137          }
0138   
0139          if (!isset($GLOBALS['utf_canonical_decomp']))
0140          {
0141              global $phpbb_root_path, $phpEx;
0142              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
0143          }
0144   
0145          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
0146      }
0147   
0148      /**
0149      * Validate and normalize a UTF string to NFKC
0150      *
0151      * @param    string    &$str    Unchecked UTF string
0152      * @return    string            The string, validated and in normal form
0153      */
0154      function nfkc(&$str)
0155      {
0156          $pos = strspn($str, UTF8_ASCII_RANGE);
0157          $len = strlen($str);
0158   
0159          if ($pos == $len)
0160          {
0161              // ASCII strings return immediately
0162              return;
0163          }
0164   
0165          if (!isset($GLOBALS['utf_nfkc_qc']))
0166          {
0167              global $phpbb_root_path, $phpEx;
0168              include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
0169          }
0170   
0171          if (!isset($GLOBALS['utf_compatibility_decomp']))
0172          {
0173              global $phpbb_root_path, $phpEx;
0174              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
0175          }
0176   
0177          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
0178      }
0179   
0180      /**
0181      * Validate and normalize a UTF string to NFD
0182      *
0183      * @param    string    &$str    Unchecked UTF string
0184      * @return    string            The string, validated and in normal form
0185      */
0186      function nfd(&$str)
0187      {
0188          $pos = strspn($str, UTF8_ASCII_RANGE);
0189          $len = strlen($str);
0190   
0191          if ($pos == $len)
0192          {
0193              // ASCII strings return immediately
0194              return;
0195          }
0196   
0197          if (!isset($GLOBALS['utf_canonical_decomp']))
0198          {
0199              global $phpbb_root_path, $phpEx;
0200              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
0201          }
0202   
0203          $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
0204      }
0205   
0206      /**
0207      * Validate and normalize a UTF string to NFKD
0208      *
0209      * @param    string    &$str    Unchecked UTF string
0210      * @return    string            The string, validated and in normal form
0211      */
0212      function nfkd(&$str)
0213      {
0214          $pos = strspn($str, UTF8_ASCII_RANGE);
0215          $len = strlen($str);
0216   
0217          if ($pos == $len)
0218          {
0219              // ASCII strings return immediately
0220              return;
0221          }
0222   
0223          if (!isset($GLOBALS['utf_compatibility_decomp']))
0224          {
0225              global $phpbb_root_path, $phpEx;
0226              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
0227          }
0228   
0229          $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
0230      }
0231   
0232   
0233      /**
0234      * Recompose a UTF string
0235      *
0236      * @param    string    $str            Unchecked UTF string
0237      * @param    integer    $pos            Position of the first UTF char (in bytes)
0238      * @param    integer    $len            Length of the string (in bytes)
0239      * @param    array    &$qc            Quick-check array, passed by reference but never modified
0240      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified
0241      * @return    string                    The string, validated and recomposed
0242      *
0243      * @access    private
0244      */
0245      function recompose($str, $pos, $len, &$qc, &$decomp_map)
0246      {
0247          global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;
0248   
0249          // Load some commonly-used tables
0250          if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))
0251          {
0252              global $phpbb_root_path, $phpEx;
0253              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
0254          }
0255   
0256          // Load the canonical composition table
0257          if (!isset($utf_canonical_comp))
0258          {
0259              global $phpbb_root_path, $phpEx;
0260              include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
0261          }
0262   
0263          // Buffer the last ASCII char before the UTF-8 stuff if applicable
0264          $tmp = '';
0265          $i = $tmp_pos = $last_cc = 0;
0266   
0267          $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
0268   
0269          // UTF char length array
0270          // This array is used to determine the length of a UTF character.
0271          // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
0272          // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
0273          // Otherwise, if $utf_len_mask[$c] is greater than 0, we have a the leading byte of a multibyte character
0274          // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
0275          $utf_len_mask = array(
0276              // Leading bytes masks
0277              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
0278              // Trailing bytes masks
0279              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
0280          );
0281   
0282          $extra_check = array(
0283              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
0284              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
0285              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
0286          );
0287   
0288          $utf_validation_mask = array(
0289              2    => "\xE0\xC0",
0290              3    => "\xF0\xC0\xC0",
0291              4    => "\xF8\xC0\xC0\xC0"
0292          );
0293   
0294          $utf_validation_check = array(
0295              2    => "\xC0\x80",
0296              3    => "\xE0\x80\x80",
0297              4    => "\xF0\x80\x80\x80"
0298          );
0299   
0300          // Main loop
0301          do
0302          {
0303              // STEP 0: Capture the current char and buffer it
0304              $c = $str[$pos];
0305              $c_mask = $c & "\xF0";
0306   
0307              if (isset($utf_len_mask[$c_mask]))
0308              {
0309                  // Byte at $pos is either a leading byte or a missplaced trailing byte
0310                  if ($utf_len = $utf_len_mask[$c_mask])
0311                  {
0312                      // Capture the char
0313                      $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);
0314   
0315                      // Let's find out if a thorough check is needed
0316                      if (isset($qc[$utf_char]))
0317                      {
0318                          // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
0319                      }
0320                      else if (isset($utf_combining_class[$utf_char]))
0321                      {
0322                          if ($utf_combining_class[$utf_char] < $last_cc)
0323                          {
0324                              // A combining character that is NOT canonically ordered
0325                          }
0326                          else
0327                          {
0328                              // A combining character that IS canonically ordered, skip to the next char
0329                              $last_cc = $utf_combining_class[$utf_char];
0330   
0331                              $pos += $utf_len;
0332                              continue;
0333                          }
0334                      }
0335                      else
0336                      {
0337                          // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
0338                          // It can be a singleton, a canonical composite, a replacement char or an even an ill-formed bunch of bytes. Let's find out
0339                          $last_cc = 0;
0340   
0341                          // Check that we have the correct number of trailing bytes
0342                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
0343                          {
0344                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
0345                              // has been encoded in a five- or six- byte sequence
0346                              if ($utf_char[0] >= "\xF8")
0347                              {
0348                                  if ($utf_char[0] < "\xFC")
0349                                  {
0350                                      $trailing_bytes = 4;
0351                                  }
0352                                  else if ($utf_char[0] > "\xFD")
0353                                  {
0354                                      $trailing_bytes = 0;
0355                                  }
0356                                  else
0357                                  {
0358                                      $trailing_bytes = 5;
0359                                  }
0360                              }
0361                              else
0362                              {
0363                                  $trailing_bytes = $utf_len - 1;
0364                              }
0365   
0366                              $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0367                              $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
0368                              $tmp_pos = $pos;
0369   
0370                              continue;
0371                          }
0372   
0373                          if (isset($extra_check[$c]))
0374                          {
0375                              switch ($c)
0376                              {
0377                                  // Note: 0xED is quite common in Korean
0378                                  case "\xED":
0379                                      if ($utf_char >= "\xED\xA0\x80")
0380                                      {
0381                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
0382                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0383                                          $pos += $utf_len;
0384                                          $tmp_pos = $pos;
0385                                          continue 2;
0386                                      }
0387                                  break;
0388   
0389                                  // Note: 0xEF is quite common in Japanese
0390                                  case "\xEF":
0391                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
0392                                      {
0393                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
0394                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0395                                          $pos += $utf_len;
0396                                          $tmp_pos = $pos;
0397                                          continue 2;
0398                                      }
0399                                  break;
0400   
0401                                  case "\xC0":
0402                                  case "\xC1":
0403                                      if ($utf_char <= "\xC1\xBF")
0404                                      {
0405                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
0406                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0407                                          $pos += $utf_len;
0408                                          $tmp_pos = $pos;
0409                                          continue 2;
0410                                      }
0411                                  break;
0412   
0413                                  case "\xE0":
0414                                      if ($utf_char <= "\xE0\x9F\xBF")
0415                                      {
0416                                          // Unicode char U+0000..U+07FF encoded in 3 bytes
0417                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0418                                          $pos += $utf_len;
0419                                          $tmp_pos = $pos;
0420                                          continue 2;
0421                                      }
0422                                  break;
0423   
0424                                  case "\xF0":
0425                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")
0426                                      {
0427                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes
0428                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0429                                          $pos += $utf_len;
0430                                          $tmp_pos = $pos;
0431                                          continue 2;
0432                                      }
0433                                  break;
0434   
0435                                  default:
0436                                      // Five- and six- byte sequences do not need being checked for here anymore
0437                                      if ($utf_char > UTF8_MAX)
0438                                      {
0439                                          // Out of the Unicode range
0440                                          if ($utf_char[0] < "\xF8")
0441                                          {
0442                                              $trailing_bytes = 3;
0443                                          }
0444                                          else if ($utf_char[0] < "\xFC")
0445                                          {
0446                                              $trailing_bytes = 4;
0447                                          }
0448                                          else if ($utf_char[0] > "\xFD")
0449                                          {
0450                                              $trailing_bytes = 0;
0451                                          }
0452                                          else
0453                                          {
0454                                              $trailing_bytes = 5;
0455                                          }
0456   
0457                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;
0458                                          $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);
0459                                          $tmp_pos = $pos;
0460                                          continue 2;
0461                                      }
0462                                  break;
0463                              }
0464                          }
0465   
0466                          // The char is a valid starter, move the cursor and go on
0467                          $pos += $utf_len;
0468                          continue;
0469                      }
0470                  }
0471                  else
0472                  {
0473                      // A trailing byte came out of nowhere, we will advance the cursor and treat the this byte and all following trailing bytes as if
0474                      // each of them was a Unicode replacement char
0475                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
0476                      $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
0477   
0478                      $pos += $spn;
0479                      $tmp_pos = $pos;
0480                      continue;
0481                  }
0482   
0483   
0484                  // STEP 1: Decompose current char
0485   
0486                  // We have found a character that is either:
0487                  //  - in the NFC_QC/NFKC_QC list
0488                  //  - a non-starter char that is not canonically ordered
0489                  //
0490                  // We are going to capture the shortest UTF sequence that satisfies these two conditions:
0491                  //
0492                  //  1 - If the sequence does not start at the begginning of the string, it must begin with a starter,
0493                  // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
0494                  //
0495                  //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
0496                  // immediately followed by a starter that is not on the QC list
0497                  //
0498                  $utf_seq = array();
0499                  $last_cc = 0;
0500                  $lpos = $pos;
0501                  $pos += $utf_len;
0502   
0503                  if (isset($decomp_map[$utf_char]))
0504                  {
0505                      $_pos = 0;
0506                      $_len = strlen($decomp_map[$utf_char]);
0507   
0508                      do
0509                      {
0510                          $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];
0511   
0512                          if (isset($_utf_len))
0513                          {
0514                              $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0515                              $_pos += $_utf_len;
0516                          }
0517                          else
0518                          {
0519                              $utf_seq[] = $decomp_map[$utf_char][$_pos];
0520                              ++$_pos;
0521                          }
0522                      }
0523                      while ($_pos < $_len);
0524                  }
0525                  else
0526                  {
0527                      // The char is not decomposable
0528                      $utf_seq = array($utf_char);
0529                  }
0530   
0531   
0532                  // STEP 2: Capture the starter
0533   
0534                  // Check out the combining class of the first character of the UTF sequence
0535                  $k = 0;
0536                  if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
0537                  {
0538                      // Not a starter, inspect previous characters
0539                      // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.
0540                      // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
0541                      // although it is slower than this method.
0542                      //
0543                      // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
0544                      // at offset $i) and process them in backward mode until we find a starter.
0545                      //
0546                      // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
0547                      // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering
0548                      $starter_found = 0;
0549                      $j_min = max(1, $i - 7);
0550   
0551                      for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)
0552                      {
0553                          $utf_char = $buffer[$j & 7];
0554                          $lpos -= strlen($utf_char);
0555   
0556                          if (isset($decomp_map[$utf_char]))
0557                          {
0558                              // The char is a composite, decompose for storage
0559                              $decomp_seq = array();
0560                              $_pos = 0;
0561                              $_len = strlen($decomp_map[$utf_char]);
0562   
0563                              do
0564                              {
0565                                  $c = $decomp_map[$utf_char][$_pos];
0566                                  $_utf_len =& $utf_len_mask[$c & "\xF0"];
0567   
0568                                  if (isset($_utf_len))
0569                                  {
0570                                      $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0571                                      $_pos += $_utf_len;
0572                                  }
0573                                  else
0574                                  {
0575                                      $decomp_seq[] = $c;
0576                                      ++$_pos;
0577                                  }
0578                              }
0579                              while ($_pos < $_len);
0580   
0581                              // Prepend the UTF sequence with our decomposed sequence
0582                              if (isset($decomp_seq[1]))
0583                              {
0584                                  // The char expanded into several chars
0585                                  $decomp_cnt = sizeof($decomp_seq);
0586   
0587                                  foreach ($decomp_seq as $decomp_i => $decomp_char)
0588                                  {
0589                                      $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;
0590                                  }
0591                                  $k -= $decomp_cnt;
0592                              }
0593                              else
0594                              {
0595                                  // Decomposed to a single char, easier to prepend
0596                                  $utf_seq[--$k] = $decomp_seq[0];
0597                              }
0598                          }
0599                          else
0600                          {
0601                              $utf_seq[--$k] = $utf_char;
0602                          }
0603   
0604                          if (!isset($utf_combining_class[$utf_seq[$k]]))
0605                          {
0606                              // We have found our starter
0607                              $starter_found = 1;
0608                              break;
0609                          }
0610                      }
0611   
0612                      if (!$starter_found && $lpos > $tmp_pos)
0613                      {
0614                          // The starter was not found in the buffer, let's rewind some more
0615                          do
0616                          {
0617                              // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_en > 0 then it's a leading byte, otherwise it's a trailing byte.
0618                              $c = $str[--$lpos];
0619                              $c_mask = $c & "\xF0";
0620   
0621                              if (isset($utf_len_mask[$c_mask]))
0622                              {
0623                                  // UTF byte
0624                                  if ($utf_len = $utf_len_mask[$c_mask])
0625                                  {
0626                                      // UTF *leading* byte
0627                                      $utf_char = substr($str, $lpos, $utf_len);
0628   
0629                                      if (isset($decomp_map[$utf_char]))
0630                                      {
0631                                          // Decompose the character
0632                                          $decomp_seq = array();
0633                                          $_pos = 0;
0634                                          $_len = strlen($decomp_map[$utf_char]);
0635   
0636                                          do
0637                                          {
0638                                              $c = $decomp_map[$utf_char][$_pos];
0639                                              $_utf_len =& $utf_len_mask[$c & "\xF0"];
0640   
0641                                              if (isset($_utf_len))
0642                                              {
0643                                                  $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0644                                                  $_pos += $_utf_len;
0645                                              }
0646                                              else
0647                                              {
0648                                                  $decomp_seq[] = $c;
0649                                                  ++$_pos;
0650                                              }
0651                                          }
0652                                          while ($_pos < $_len);
0653   
0654                                          // Prepend the UTF sequence with our decomposed sequence
0655                                          if (isset($decomp_seq[1]))
0656                                          {
0657                                              // The char expanded into several chars
0658                                              $decomp_cnt = sizeof($decomp_seq);
0659                                              foreach ($decomp_seq as $decomp_i => $utf_char)
0660                                              {
0661                                                  $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;
0662                                              }
0663                                              $k -= $decomp_cnt;
0664                                          }
0665                                          else
0666                                          {
0667                                              // Decomposed to a single char, easier to prepend
0668                                              $utf_seq[--$k] = $decomp_seq[0];
0669                                          }
0670                                      }
0671                                      else
0672                                      {
0673                                          $utf_seq[--$k] = $utf_char;
0674                                      }
0675                                  }
0676                              }
0677                              else
0678                              {
0679                                  // ASCII char
0680                                  $utf_seq[--$k] = $c;
0681                              }
0682                          }
0683                          while ($lpos > $tmp_pos);
0684                      }
0685                  }
0686   
0687   
0688                  // STEP 3: Capture following combining modifiers
0689   
0690                  while ($pos < $len)
0691                  {
0692                      $c_mask = $str[$pos] & "\xF0";
0693   
0694                      if (isset($utf_len_mask[$c_mask]))
0695                      {
0696                          if ($utf_len = $utf_len_mask[$c_mask])
0697                          {
0698                              $utf_char = substr($str, $pos, $utf_len);
0699                          }
0700                          else
0701                          {
0702                              // A trailing byte came out of nowhere
0703                              // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
0704                              // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
0705                              break;
0706                          }
0707   
0708                          if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
0709                          {
0710                              // Combining character, add it to the sequence and move the cursor
0711                              if (isset($decomp_map[$utf_char]))
0712                              {
0713                                  // Decompose the character
0714                                  $_pos = 0;
0715                                  $_len = strlen($decomp_map[$utf_char]);
0716   
0717                                  do
0718                                  {
0719                                      $c = $decomp_map[$utf_char][$_pos];
0720                                      $_utf_len =& $utf_len_mask[$c & "\xF0"];
0721   
0722                                      if (isset($_utf_len))
0723                                      {
0724                                          $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);
0725                                          $_pos += $_utf_len;
0726                                      }
0727                                      else
0728                                      {
0729                                          $utf_seq[] = $c;
0730                                          ++$_pos;
0731                                      }
0732                                  }
0733                                  while ($_pos < $_len);
0734                              }
0735                              else
0736                              {
0737                                  $utf_seq[] = $utf_char;
0738                              }
0739   
0740                              $pos += $utf_len;
0741                          }
0742                          else
0743                          {
0744                              // Combining class 0 and no QC, break out of the loop
0745                              // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
0746                              break;
0747                          }
0748                      }
0749                      else
0750                      {
0751                          // ASCII chars are starters
0752                          break;
0753                      }
0754                  }
0755   
0756   
0757                  // STEP 4: Sort and combine
0758   
0759                  // Here we sort...
0760                  $k_max = $k + sizeof($utf_seq);
0761   
0762                  if (!$k && $k_max == 1)
0763                  {
0764                      // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
0765                          // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
0766  //                        if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
0767  //                        {
0768                          $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];
0769                          $tmp_pos = $pos;
0770  //                        }
0771   
0772                      continue;
0773                  }
0774   
0775                  // ...there we combine
0776                  if (isset($utf_combining_class[$utf_seq[$k]]))
0777                  {
0778                      $starter = $nf_seq = '';
0779                  }
0780                  else
0781                  {
0782                      $starter = $utf_seq[$k++];
0783                      $nf_seq = '';
0784                  }
0785                  $utf_sort = array();
0786   
0787                  // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
0788                  // at the end of the string without altering it
0789                  $utf_seq[] = '';
0790   
0791                  do
0792                  {
0793                      $utf_char = $utf_seq[$k++];
0794   
0795                      if (isset($utf_combining_class[$utf_char]))
0796                      {
0797                          $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
0798                      }
0799                      else
0800                      {
0801                          if (empty($utf_sort))
0802                          {
0803                              // No combining characters... check for a composite of the two starters
0804                              if (isset($utf_canonical_comp[$starter . $utf_char]))
0805                              {
0806                                  // Good ol' composite character
0807                                  $starter = $utf_canonical_comp[$starter . $utf_char];
0808                              }
0809                              else if (isset($utf_jamo_type[$utf_char]))
0810                              {
0811                                  // Current char is a composable jamo
0812                                  if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
0813                                  {
0814                                      // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
0815                                      if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
0816                                      {
0817                                          // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
0818                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
0819                                          ++$k;
0820                                      }
0821                                      else
0822                                      {
0823                                          // L+V jamos, combine to a LV Hangul syllable
0824                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
0825                                      }
0826   
0827                                      $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
0828                                  }
0829                                  else
0830                                  {
0831                                      // Non-composable jamo, just add it to the sequence
0832                                      $nf_seq .= $starter;
0833                                      $starter = $utf_char;
0834                                  }
0835                              }
0836                              else
0837                              {
0838                                  // No composite, just add the first starter to the sequence then continue with the other one
0839                                  $nf_seq .= $starter;
0840                                  $starter = $utf_char;
0841                              }
0842                          }
0843                          else
0844                          {
0845                              ksort($utf_sort);
0846   
0847                              // For each class of combining characters
0848                              foreach ($utf_sort as $cc => $utf_chars)
0849                              {
0850                                  $j = 0;
0851   
0852                                  do
0853                                  {
0854                                      // Look for a composite
0855                                      if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))
0856                                      {
0857                                          // Found a composite, replace the starter
0858                                          $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];
0859                                          unset($utf_sort[$cc][$j]);
0860                                      }
0861                                      else
0862                                      {
0863                                          // No composite, all following characters in that class are blocked
0864                                          break;
0865                                      }
0866                                  }
0867                                  while (isset($utf_sort[$cc][++$j]));
0868                              }
0869   
0870                              // Add the starter to the normalized sequence, followed by non-starters in canonical order
0871                              $nf_seq .= $starter;
0872   
0873                              foreach ($utf_sort as $utf_chars)
0874                              {
0875                                  if (!empty($utf_chars))
0876                                  {
0877                                      $nf_seq .= implode('', $utf_chars);
0878                                  }
0879                              }
0880   
0881                              // Reset the array and go on
0882                              $utf_sort = array();
0883                              $starter = $utf_char;
0884                          }
0885                      }
0886                  }
0887                  while ($k <= $k_max);
0888   
0889                  $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;
0890                  $tmp_pos = $pos;
0891              }
0892              else
0893              {
0894                  // Only an ASCII char can make the program get here
0895                  //
0896                  // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
0897                  //
0898                  // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
0899                  // multi-byte text (where the only ASCII chars are spaces and punctuation)
0900                  if (++$pos != $len)
0901                  {
0902                      if ($str[$pos] < "\x80")
0903                      {
0904                          $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
0905                          $buffer[++$i & 7] = $str[$pos - 1];
0906                      }
0907                      else
0908                      {
0909                          $buffer[++$i & 7] = $c;
0910                      }
0911                  }
0912              }
0913          }
0914          while ($pos < $len);
0915   
0916          // Now is time to return the string
0917          if ($tmp_pos)
0918          {
0919              // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version
0920              if ($tmp_pos == $len)
0921              {
0922                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
0923                  return $tmp;
0924              }
0925              else
0926              {
0927                  // The rightmost chunk of $str has not been appended to $tmp yet
0928                  return $tmp . substr($str, $tmp_pos);
0929              }
0930          }
0931   
0932          // The string was already in normal form
0933          return $str;
0934      }
0935   
0936      /**
0937      * Decompose a UTF string
0938      *
0939      * @param    string    $str            UTF string
0940      * @param    integer    $pos            Position of the first UTF char (in bytes)
0941      * @param    integer    $len            Length of the string (in bytes)
0942      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified
0943      * @return    string                    The string, decomposed and sorted canonically
0944      *
0945      * @access    private
0946      */
0947      function decompose($str, $pos, $len, &$decomp_map)
0948      {
0949          global $utf_combining_class;
0950   
0951          // Load some commonly-used tables
0952          if (!isset($utf_combining_class))
0953          {
0954              global $phpbb_root_path, $phpEx;
0955              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);
0956          }
0957   
0958          // UTF char length array
0959          $utf_len_mask = array(
0960              // Leading bytes masks
0961              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,
0962              // Trailing bytes masks
0963              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0
0964          );
0965   
0966          // Some extra checks are triggered on the first byte of a UTF sequence
0967          $extra_check = array(
0968              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,
0969              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,
0970              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1
0971          );
0972   
0973          // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
0974          //   - 2-byte: 110? ???? 10?? ????
0975          //   - 3-byte: 1110 ???? 10?? ???? 10?? ????
0976          //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
0977          // Note that 5- and 6- byte sequences are automatically discarded
0978          $utf_validation_mask = array(
0979              2    => "\xE0\xC0",
0980              3    => "\xF0\xC0\xC0",
0981              4    => "\xF8\xC0\xC0\xC0"
0982          );
0983   
0984          $utf_validation_check = array(
0985              2    => "\xC0\x80",
0986              3    => "\xE0\x80\x80",
0987              4    => "\xF0\x80\x80\x80"
0988          );
0989   
0990          $tmp = '';
0991          $starter_pos = $pos;
0992          $tmp_pos = $last_cc = $sort = $dump = 0;
0993          $utf_sort = array();
0994   
0995   
0996          // Main loop
0997          do
0998          {
0999              // STEP 0: Capture the current char
1000   
1001              $cur_mask = $str[$pos] & "\xF0";
1002              if (isset($utf_len_mask[$cur_mask]))
1003              {
1004                  if ($utf_len = $utf_len_mask[$cur_mask])
1005                  {
1006                      // Multibyte char
1007                      $utf_char = substr($str, $pos, $utf_len);
1008                      $pos += $utf_len;
1009                  }
1010                  else
1011                  {
1012                      // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
1013                      // replacement char and we will advance the cursor
1014                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);
1015   
1016                      if ($dump)
1017                      {
1018                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1019   
1020                          // Dump combiners
1021                          if (!empty($utf_sort))
1022                          {
1023                              if ($sort)
1024                              {
1025                                  ksort($utf_sort);
1026                              }
1027   
1028                              foreach ($utf_sort as $utf_chars)
1029                              {
1030                                  $tmp .= implode('', $utf_chars);
1031                              }
1032                          }
1033   
1034                          $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);
1035                          $dump = $sort = 0;
1036                      }
1037                      else
1038                      {
1039                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);
1040                      }
1041   
1042                      $pos += $spn;
1043                      $tmp_pos = $starter_pos = $pos;
1044   
1045                      $utf_sort = array();
1046                      $last_cc = 0;
1047   
1048                      continue;
1049                  }
1050   
1051   
1052                  // STEP 1: Decide what to do with current char
1053   
1054                  // Now, in that order:
1055                  //  - check if that character is decomposable
1056                  //  - check if that character is a non-starter
1057                  //  - check if that character requires extra checks to be performed
1058                  if (isset($decomp_map[$utf_char]))
1059                  {
1060                      // Decompose the char
1061                      $_pos = 0;
1062                      $_len = strlen($decomp_map[$utf_char]);
1063   
1064                      do
1065                      {
1066                          $c = $decomp_map[$utf_char][$_pos];
1067                          $_utf_len =& $utf_len_mask[$c & "\xF0"];
1068   
1069                          if (isset($_utf_len))
1070                          {
1071                              $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);
1072                              $_pos += $_utf_len;
1073   
1074                              if (isset($utf_combining_class[$_utf_char]))
1075                              {
1076                                  // The character decomposed to a non-starter, buffer it for sorting
1077                                  $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;
1078   
1079                                  if ($utf_combining_class[$_utf_char] < $last_cc)
1080                                  {
1081                                      // Not canonically ordered, will require sorting
1082                                      $sort = $dump = 1;
1083                                  }
1084                                  else
1085                                  {
1086                                      $dump = 1;
1087                                      $last_cc = $utf_combining_class[$_utf_char];
1088                                  }
1089                              }
1090                              else
1091                              {
1092                                  // This character decomposition contains a starter, dump the buffer and continue
1093                                  if ($dump)
1094                                  {
1095                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1096   
1097                                      // Dump combiners
1098                                      if (!empty($utf_sort))
1099                                      {
1100                                          if ($sort)
1101                                          {
1102                                              ksort($utf_sort);
1103                                          }
1104   
1105                                          foreach ($utf_sort as $utf_chars)
1106                                          {
1107                                              $tmp .= implode('', $utf_chars);
1108                                          }
1109                                      }
1110   
1111                                      $tmp .= $_utf_char;
1112                                      $dump = $sort = 0;
1113                                  }
1114                                  else
1115                                  {
1116                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;
1117                                  }
1118   
1119                                  $tmp_pos = $starter_pos = $pos;
1120                                  $utf_sort = array();
1121                                  $last_cc = 0;
1122                              }
1123                          }
1124                          else
1125                          {
1126                              // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
1127                              ++$_pos;
1128   
1129                              if ($dump)
1130                              {
1131                                  $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1132   
1133                                  // Dump combiners
1134                                  if (!empty($utf_sort))
1135                                  {
1136                                      if ($sort)
1137                                      {
1138                                          ksort($utf_sort);
1139                                      }
1140   
1141                                      foreach ($utf_sort as $utf_chars)
1142                                      {
1143                                          $tmp .= implode('', $utf_chars);
1144                                      }
1145                                  }
1146   
1147                                  $tmp .= $c;
1148                                  $dump = $sort = 0;
1149                              }
1150                              else
1151                              {
1152                                  $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;
1153                              }
1154   
1155                              $tmp_pos = $starter_pos = $pos;
1156                              $utf_sort = array();
1157                              $last_cc = 0;
1158                          }
1159                      }
1160                      while ($_pos < $_len);
1161                  }
1162                  else if (isset($utf_combining_class[$utf_char]))
1163                  {
1164                      // Combining character
1165                      if ($utf_combining_class[$utf_char] < $last_cc)
1166                      {
1167                          // Not in canonical order
1168                          $sort = $dump = 1;
1169                      }
1170                      else
1171                      {
1172                          $last_cc = $utf_combining_class[$utf_char];
1173                      }
1174   
1175                      $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
1176                  }
1177                  else
1178                  {
1179                      // Non-decomposable starter, check out if it's a Hangul syllable
1180                      if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)
1181                      {
1182                          // Nope, regular UTF char, check that we have the correct number of trailing bytes
1183                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
1184                          {
1185                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
1186                              // has been encoded in a five- or six- byte sequence.
1187                              // Move the cursor back to its original position then advance it to the position it should really be at
1188                              $pos -= $utf_len;
1189                              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1190   
1191                              if (!empty($utf_sort))
1192                              {
1193                                  ksort($utf_sort);
1194   
1195                                  foreach ($utf_sort as $utf_chars)
1196                                  {
1197                                      $tmp .= implode('', $utf_chars);
1198                                  }
1199                                  $utf_sort = array();
1200                              }
1201   
1202                              // Add a replacement char then another replacement char for every trailing byte.
1203                              //
1204                              // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
1205                              $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);
1206                              $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);
1207   
1208                              $dump = $sort = 0;
1209   
1210                              $pos += $spn;
1211                              $tmp_pos = $pos;
1212                              continue;
1213                          }
1214   
1215                          if (isset($extra_check[$utf_char[0]]))
1216                          {
1217                              switch ($utf_char[0])
1218                              {
1219                                  // Note: 0xED is quite common in Korean
1220                                  case "\xED":
1221                                      if ($utf_char >= "\xED\xA0\x80")
1222                                      {
1223                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
1224                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1225   
1226                                          if (!empty($utf_sort))
1227                                          {
1228                                              ksort($utf_sort);
1229   
1230                                              foreach ($utf_sort as $utf_chars)
1231                                              {
1232                                                  $tmp .= implode('', $utf_chars);
1233                                              }
1234                                              $utf_sort = array();
1235                                          }
1236   
1237                                          $tmp .= UTF8_REPLACEMENT;
1238                                          $dump = $sort = 0;
1239   
1240                                          $tmp_pos = $starter_pos = $pos;
1241                                          continue 2;
1242                                      }
1243                                  break;
1244   
1245                                  // Note: 0xEF is quite common in Japanese
1246                                  case "\xEF":
1247                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")
1248                                      {
1249                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
1250                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1251   
1252                                          if (!empty($utf_sort))
1253                                          {
1254                                              ksort($utf_sort);
1255   
1256                                              foreach ($utf_sort as $utf_chars)
1257                                              {
1258                                                  $tmp .= implode('', $utf_chars);
1259                                              }
1260                                              $utf_sort = array();
1261                                          }
1262   
1263                                          $tmp .= UTF8_REPLACEMENT;
1264                                          $dump = $sort = 0;
1265   
1266                                          $tmp_pos = $starter_pos = $pos;
1267                                          continue 2;
1268                                      }
1269                                  break;
1270   
1271                                  case "\xC0":
1272                                  case "\xC1":
1273                                      if ($utf_char <= "\xC1\xBF")
1274                                      {
1275                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
1276                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1277   
1278                                          if (!empty($utf_sort))
1279                                          {
1280                                              ksort($utf_sort);
1281   
1282                                              foreach ($utf_sort as $utf_chars)
1283                                              {
1284                                                  $tmp .= implode('', $utf_chars);
1285                                              }
1286                                              $utf_sort = array();
1287                                          }
1288   
1289                                          $tmp .= UTF8_REPLACEMENT;
1290                                          $dump = $sort = 0;
1291   
1292                                          $tmp_pos = $starter_pos = $pos;
1293                                          continue 2;
1294                                      }
1295                                  break;
1296   
1297                                  case "\xE0":
1298                                      if ($utf_char <= "\xE0\x9F\xBF")
1299                                      {
1300                                          // Unicode char U+0000..U+07FF encoded in 3 bytes
1301                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1302   
1303                                          if (!empty($utf_sort))
1304                                          {
1305                                              ksort($utf_sort);
1306   
1307                                              foreach ($utf_sort as $utf_chars)
1308                                              {
1309                                                  $tmp .= implode('', $utf_chars);
1310                                              }
1311                                              $utf_sort = array();
1312                                          }
1313   
1314                                          $tmp .= UTF8_REPLACEMENT;
1315                                          $dump = $sort = 0;
1316   
1317                                          $tmp_pos = $starter_pos = $pos;
1318                                          continue 2;
1319                                      }
1320                                  break;
1321   
1322                                  case "\xF0":
1323                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")
1324                                      {
1325                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes
1326                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1327   
1328                                          if (!empty($utf_sort))
1329                                          {
1330                                              ksort($utf_sort);
1331   
1332                                              foreach ($utf_sort as $utf_chars)
1333                                              {
1334                                                  $tmp .= implode('', $utf_chars);
1335                                              }
1336                                              $utf_sort = array();
1337                                          }
1338   
1339                                          $tmp .= UTF8_REPLACEMENT;
1340                                          $dump = $sort = 0;
1341   
1342                                          $tmp_pos = $starter_pos = $pos;
1343                                          continue 2;
1344                                      }
1345                                  break;
1346   
1347                                  default:
1348                                      if ($utf_char > UTF8_MAX)
1349                                      {
1350                                          // Out of the Unicode range
1351                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1352   
1353                                          if (!empty($utf_sort))
1354                                          {
1355                                              ksort($utf_sort);
1356   
1357                                              foreach ($utf_sort as $utf_chars)
1358                                              {
1359                                                  $tmp .= implode('', $utf_chars);
1360                                              }
1361                                              $utf_sort = array();
1362                                          }
1363   
1364                                          $tmp .= UTF8_REPLACEMENT;
1365                                          $dump = $sort = 0;
1366   
1367                                          $tmp_pos = $starter_pos = $pos;
1368                                          continue 2;
1369                                      }
1370                                  break;
1371                              }
1372                          }
1373                      }
1374                      else
1375                      {
1376                          // Hangul syllable
1377                          $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;
1378   
1379                          // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
1380                          //
1381                          // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
1382                          if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
1383                          {
1384                              if ($t_index < 25)
1385                              {
1386                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
1387                                  $utf_char[8] = chr(0xA7 + $t_index);
1388                              }
1389                              else
1390                              {
1391                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
1392                                  $utf_char[8] = chr(0x67 + $t_index);
1393                              }
1394                          }
1395                          else
1396                          {
1397                              $utf_char = "\xE1\x84\x00\xE1\x85\x00";
1398                          }
1399   
1400                          $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));
1401                          $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));
1402   
1403                          // Just like other decompositions, the resulting Jamos must be dumped to the tmp string
1404                          $dump = 1;
1405                      }
1406   
1407                      // Do we need to dump stuff to the tmp string?
1408                      if ($dump)
1409                      {
1410                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1411   
1412                          // Dump combiners
1413                          if (!empty($utf_sort))
1414                          {
1415                              if ($sort)
1416                              {
1417                                  ksort($utf_sort);
1418                              }
1419   
1420                              foreach ($utf_sort as $utf_chars)
1421                              {
1422                                  $tmp .= implode('', $utf_chars);
1423                              }
1424                          }
1425   
1426                          $tmp .= $utf_char;
1427                          $dump = $sort = 0;
1428                          $tmp_pos = $pos;
1429                      }
1430   
1431                      $last_cc = 0;
1432                      $utf_sort = array();
1433                      $starter_pos = $pos;
1434                  }
1435              }
1436              else
1437              {
1438                  // ASCII char, which happens to be a starter (as any other ASCII char)
1439                  if ($dump)
1440                  {
1441                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1442   
1443                      // Dump combiners
1444                      if (!empty($utf_sort))
1445                      {
1446                          if ($sort)
1447                          {
1448                              ksort($utf_sort);
1449                          }
1450   
1451                          foreach ($utf_sort as $utf_chars)
1452                          {
1453                              $tmp .= implode('', $utf_chars);
1454                          }
1455                      }
1456   
1457                      $tmp .= $str[$pos];
1458                      $dump = $sort = 0;
1459                      $tmp_pos = ++$pos;
1460   
1461                      $pos += strspn($str, UTF8_ASCII_RANGE, $pos);
1462                  }
1463                  else
1464                  {
1465                      $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);
1466                  }
1467   
1468                  $last_cc = 0;
1469                  $utf_sort = array();
1470                  $starter_pos = $pos;
1471              }
1472          }
1473          while ($pos < $len);
1474   
1475          // Now it is time to return the string
1476          if ($dump)
1477          {
1478              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);
1479   
1480              // Dump combiners
1481              if (!empty($utf_sort))
1482              {
1483                  if ($sort)
1484                  {
1485                      ksort($utf_sort);
1486                  }
1487   
1488                  foreach ($utf_sort as $utf_chars)
1489                  {
1490                      $tmp .= implode('', $utf_chars);
1491                  }
1492              }
1493   
1494              return $tmp;
1495          }
1496          else if ($tmp_pos)
1497          {
1498              // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
1499              if ($tmp_pos == $len)
1500              {
1501                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
1502                  return $tmp;
1503              }
1504              else
1505              {
1506                  // The rightmost chunk of $str has not been appended to $tmp yet
1507                  return $tmp . substr($str, $tmp_pos);
1508              }
1509          }
1510   
1511          // The string was already in normal form
1512          return $str;
1513      }
1514  }
1515   
1516  ?>