wimCMS • Blick zurück von phpBB 1.0.0 bis heute

Verzeichnisstruktur phpBB-3.0.0

Veröffentlicht: 12.12.2007

So funktioniert es

Verzeichnis-Info phpBB-1.0.0 / auth.php	Auf das letzte Element klicken. Dies geht jeweils einen Schritt zurück.
admin	Auf das Icon klicken, dies öffnet das Verzeichnis. Nochmal klicken schließt das Verzeichnis. Auf den Verzeichnisnamen klicken, dies zeigt nur das Verzeichnis mit Inhalt an
(Beispiel Datei-Icons)	Auf das Icon klicken, um den Quellcode anzuzeigen

utf_normalizer.php

Zuletzt modifiziert: 09.10.2024, 12:51 - Dateigröße: 41.73 KiB


     0001  <?php

     0002  /**

     0003  *

     0004  * @package utf

     0005  * @version $Id$

     0006  * @copyright (c) 2005 phpBB Group

     0007  * @license http://opensource.org/licenses/gpl-license.php GNU Public License

     0008  *

     0009  */

     0010   

     0011  /**

     0012  */

     0013  if (!defined('IN_PHPBB'))

     0014  {

     0015      exit;

     0016  }

     0017   

     0018  /**

     0019  * Some Unicode characters encoded in UTF-8

     0020  *

     0021  * Preserved for compatibility

     0022  */

     0023  define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");

     0024  define('UTF8_MAX', "\xF4\x8F\xBF\xBF");

     0025  define('UTF8_FFFE', "\xEF\xBF\xBE");

     0026  define('UTF8_FFFF', "\xEF\xBF\xBF");

     0027  define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");

     0028  define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");

     0029  define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");

     0030  define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");

     0031   

     0032  define('UTF8_CJK_FIRST', "\xE4\xB8\x80");

     0033  define('UTF8_CJK_LAST', "\xE9\xBE\xBB");

     0034  define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");

     0035  define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");

     0036   

     0037  // Unset global variables

     0038  unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

     0039   

     0040  // NFC_QC and NFKC_QC values

     0041  define('UNICODE_QC_MAYBE', 0);

     0042  define('UNICODE_QC_NO', 1);

     0043   

     0044  // Contains all the ASCII characters appearing in UTF-8, sorted by frequency

     0045  define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

     0046   

     0047  // Contains all the tail bytes that can appear in the composition of a UTF-8 char

     0048  define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

     0049   

     0050  // Constants used by the Hangul [de]composition algorithms

     0051  define('UNICODE_HANGUL_SBASE', 0xAC00);

     0052  define('UNICODE_HANGUL_LBASE', 0x1100);

     0053  define('UNICODE_HANGUL_VBASE', 0x1161);

     0054  define('UNICODE_HANGUL_TBASE', 0x11A7);

     0055  define('UNICODE_HANGUL_SCOUNT', 11172);

     0056  define('UNICODE_HANGUL_LCOUNT', 19);

     0057  define('UNICODE_HANGUL_VCOUNT', 21);

     0058  define('UNICODE_HANGUL_TCOUNT', 28);

     0059  define('UNICODE_HANGUL_NCOUNT', 588);

     0060  define('UNICODE_JAMO_L', 0);

     0061  define('UNICODE_JAMO_V', 1);

     0062  define('UNICODE_JAMO_T', 2);

     0063   

     0064  /**

     0065  * Unicode normalization routines

     0066  *

     0067  * @package utf

     0068  */

     0069  class utf_normalizer

     0070  {

    /**
    * Validate, clean up and normalize a string in place
    *
    * Invalid UTF-8 sequences and disallowed control characters are handed to
    * recompose() for replacement, and the text is converted to Normal Form C
    * (canonical composition).
    *
    * @param    string    &$str    The dirty string; it is overwritten with the cleaned-up result
    * @return    void
    */
    function cleanup(&$str)
    {
        $byte_len = strlen($str);

        // Length of the leading run made only of authorized ASCII characters.
        // The mask lists them sorted by frequency in latin text; control characters
        // other than \t, \n and \r are deliberately absent from it.
        $ascii_len = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");

        // A string consisting solely of authorized ASCII needs no work at all
        if ($ascii_len != $byte_len)
        {
            // Load the NFC quick-check table on first use
            if (!isset($GLOBALS['utf_nfc_qc']))
            {
                global $phpbb_root_path, $phpEx;
                include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
            }

            // Load the canonical decomposition table on first use (checked independently)
            if (!isset($GLOBALS['utf_canonical_decomp']))
            {
                global $phpbb_root_path, $phpEx;
                include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
            }

            // Turn every byte in 0x00..0x1F except \t, \n and \r into 0xFF. That byte is
            // illegal in UTF-8, so it will in turn be replaced with a UTF replacement char.
            // The substitution is byte-for-byte, hence the offsets computed above stay valid.
            $str = strtr(
                $str,
                "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
                "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
            );

            $str = utf_normalizer::recompose($str, $ascii_len, $byte_len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
        }
    }

     0115   

    /**
    * Validate a UTF string and normalize it to NFC, in place
    *
    * @param    string    &$str    Unchecked UTF string; it is overwritten with the validated, normalized result
    * @return    void
    */
    function nfc(&$str)
    {
        $byte_len = strlen($str);

        // Number of leading bytes that are plain ASCII
        $ascii_len = strspn($str, UTF8_ASCII_RANGE);

        // Pure ASCII strings are left untouched
        if ($ascii_len != $byte_len)
        {
            // Load the NFC quick-check table on first use
            if (!isset($GLOBALS['utf_nfc_qc']))
            {
                global $phpbb_root_path, $phpEx;
                include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
            }

            // Load the canonical decomposition table on first use
            if (!isset($GLOBALS['utf_canonical_decomp']))
            {
                global $phpbb_root_path, $phpEx;
                include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
            }

            // Processing starts at the first non-ASCII byte
            $str = utf_normalizer::recompose($str, $ascii_len, $byte_len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
        }
    }

     0147   

    /**
    * Validate a UTF string and normalize it to NFKC, in place
    *
    * @param    string    &$str    Unchecked UTF string; it is overwritten with the validated, normalized result
    * @return    void
    */
    function nfkc(&$str)
    {
        $byte_len = strlen($str);

        // Number of leading bytes that are plain ASCII
        $ascii_len = strspn($str, UTF8_ASCII_RANGE);

        // Pure ASCII strings are left untouched
        if ($ascii_len != $byte_len)
        {
            // Load the NFKC quick-check table on first use
            if (!isset($GLOBALS['utf_nfkc_qc']))
            {
                global $phpbb_root_path, $phpEx;
                include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
            }

            // Load the compatibility decomposition table on first use
            if (!isset($GLOBALS['utf_compatibility_decomp']))
            {
                global $phpbb_root_path, $phpEx;
                include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
            }

            // Processing starts at the first non-ASCII byte
            $str = utf_normalizer::recompose($str, $ascii_len, $byte_len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
        }
    }

     0179   

    /**
    * Validate a UTF string and normalize it to NFD, in place
    *
    * @param    string    &$str    Unchecked UTF string; it is overwritten with the validated, normalized result
    * @return    void
    */
    function nfd(&$str)
    {
        $byte_len = strlen($str);

        // Number of leading bytes that are plain ASCII
        $ascii_len = strspn($str, UTF8_ASCII_RANGE);

        // Pure ASCII strings are left untouched
        if ($ascii_len != $byte_len)
        {
            // Load the canonical decomposition table on first use
            if (!isset($GLOBALS['utf_canonical_decomp']))
            {
                global $phpbb_root_path, $phpEx;
                include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
            }

            // Processing starts at the first non-ASCII byte
            $str = utf_normalizer::decompose($str, $ascii_len, $byte_len, $GLOBALS['utf_canonical_decomp']);
        }
    }

     0205   

    /**
    * Validate a UTF string and normalize it to NFKD, in place
    *
    * @param    string    &$str    Unchecked UTF string; it is overwritten with the validated, normalized result
    * @return    void
    */
    function nfkd(&$str)
    {
        $byte_len = strlen($str);

        // Number of leading bytes that are plain ASCII
        $ascii_len = strspn($str, UTF8_ASCII_RANGE);

        // Pure ASCII strings are left untouched
        if ($ascii_len != $byte_len)
        {
            // Load the compatibility decomposition table on first use
            if (!isset($GLOBALS['utf_compatibility_decomp']))
            {
                global $phpbb_root_path, $phpEx;
                include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
            }

            // Processing starts at the first non-ASCII byte
            $str = utf_normalizer::decompose($str, $ascii_len, $byte_len, $GLOBALS['utf_compatibility_decomp']);
        }
    }

     0231   

     0232   

     0233      /**

     0234      * Recompose a UTF string

     0235      *

     0236      * @param    string    $str            Unchecked UTF string

     0237      * @param    integer    $pos            Position of the first UTF char (in bytes)

     0238      * @param    integer    $len            Length of the string (in bytes)

     0239      * @param    array    &$qc            Quick-check array, passed by reference but never modified

     0240      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified

     0241      * @return    string                    The string, validated and recomposed

     0242      *

     0243      * @access    private

     0244      */

     0245      function recompose($str, $pos, $len, &$qc, &$decomp_map)

     0246      {

     0247          global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;

     0248   

     0249          // Load some commonly-used tables

     0250          if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))

     0251          {

     0252              global $phpbb_root_path, $phpEx;

     0253              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);

     0254          }

     0255   

     0256          // Load the canonical composition table

     0257          if (!isset($utf_canonical_comp))

     0258          {

     0259              global $phpbb_root_path, $phpEx;

     0260              include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);

     0261          }

     0262   

     0263          // Buffer the last ASCII char before the UTF-8 stuff if applicable

     0264          $tmp = '';

     0265          $i = $tmp_pos = $last_cc = 0;

     0266   

     0267          $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();

     0268   

     0269          // UTF char length array

     0270          // This array is used to determine the length of a UTF character.

     0271          // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos

     0272          // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.

     0273          // Otherwise, if $utf_len_mask[$c] is greater than 0, we have a the leading byte of a multibyte character

     0274          // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.

     0275          $utf_len_mask = array(

     0276              // Leading bytes masks

     0277              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,

     0278              // Trailing bytes masks

     0279              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0

     0280          );

     0281   

     0282          $extra_check = array(

     0283              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,

     0284              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,

     0285              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1

     0286          );

     0287   

     0288          $utf_validation_mask = array(

     0289              2    => "\xE0\xC0",

     0290              3    => "\xF0\xC0\xC0",

     0291              4    => "\xF8\xC0\xC0\xC0"

     0292          );

     0293   

     0294          $utf_validation_check = array(

     0295              2    => "\xC0\x80",

     0296              3    => "\xE0\x80\x80",

     0297              4    => "\xF0\x80\x80\x80"

     0298          );

     0299   

     0300          // Main loop

     0301          do

     0302          {

     0303              // STEP 0: Capture the current char and buffer it

     0304              $c = $str[$pos];

     0305              $c_mask = $c & "\xF0";

     0306   

     0307              if (isset($utf_len_mask[$c_mask]))

     0308              {

     0309                  // Byte at $pos is either a leading byte or a missplaced trailing byte

     0310                  if ($utf_len = $utf_len_mask[$c_mask])

     0311                  {

     0312                      // Capture the char

     0313                      $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);

     0314   

     0315                      // Let's find out if a thorough check is needed

     0316                      if (isset($qc[$utf_char]))

     0317                      {

     0318                          // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block

     0319                      }

     0320                      else if (isset($utf_combining_class[$utf_char]))

     0321                      {

     0322                          if ($utf_combining_class[$utf_char] < $last_cc)

     0323                          {

     0324                              // A combining character that is NOT canonically ordered

     0325                          }

     0326                          else

     0327                          {

     0328                              // A combining character that IS canonically ordered, skip to the next char

     0329                              $last_cc = $utf_combining_class[$utf_char];

     0330   

     0331                              $pos += $utf_len;

     0332                              continue;

     0333                          }

     0334                      }

     0335                      else

     0336                      {

     0337                          // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.

     0338                          // It can be a singleton, a canonical composite, a replacement char or an even an ill-formed bunch of bytes. Let's find out

     0339                          $last_cc = 0;

     0340   

     0341                          // Check that we have the correct number of trailing bytes

     0342                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])

     0343                          {

     0344                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char

     0345                              // has been encoded in a five- or six- byte sequence

     0346                              if ($utf_char[0] >= "\xF8")

     0347                              {

     0348                                  if ($utf_char[0] < "\xFC")

     0349                                  {

     0350                                      $trailing_bytes = 4;

     0351                                  }

     0352                                  else if ($utf_char[0] > "\xFD")

     0353                                  {

     0354                                      $trailing_bytes = 0;

     0355                                  }

     0356                                  else

     0357                                  {

     0358                                      $trailing_bytes = 5;

     0359                                  }

     0360                              }

     0361                              else

     0362                              {

     0363                                  $trailing_bytes = $utf_len - 1;

     0364                              }

     0365   

     0366                              $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0367                              $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);

     0368                              $tmp_pos = $pos;

     0369   

     0370                              continue;

     0371                          }

     0372   

     0373                          if (isset($extra_check[$c]))

     0374                          {

     0375                              switch ($c)

     0376                              {

     0377                                  // Note: 0xED is quite common in Korean

     0378                                  case "\xED":

     0379                                      if ($utf_char >= "\xED\xA0\x80")

     0380                                      {

     0381                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)

     0382                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0383                                          $pos += $utf_len;

     0384                                          $tmp_pos = $pos;

     0385                                          continue 2;

     0386                                      }

     0387                                  break;

     0388   

     0389                                  // Note: 0xEF is quite common in Japanese

     0390                                  case "\xEF":

     0391                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")

     0392                                      {

     0393                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)

     0394                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0395                                          $pos += $utf_len;

     0396                                          $tmp_pos = $pos;

     0397                                          continue 2;

     0398                                      }

     0399                                  break;

     0400   

     0401                                  case "\xC0":

     0402                                  case "\xC1":

     0403                                      if ($utf_char <= "\xC1\xBF")

     0404                                      {

     0405                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char

     0406                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0407                                          $pos += $utf_len;

     0408                                          $tmp_pos = $pos;

     0409                                          continue 2;

     0410                                      }

     0411                                  break;

     0412   

     0413                                  case "\xE0":

     0414                                      if ($utf_char <= "\xE0\x9F\xBF")

     0415                                      {

     0416                                          // Unicode char U+0000..U+07FF encoded in 3 bytes

     0417                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0418                                          $pos += $utf_len;

     0419                                          $tmp_pos = $pos;

     0420                                          continue 2;

     0421                                      }

     0422                                  break;

     0423   

     0424                                  case "\xF0":

     0425                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")

     0426                                      {

     0427                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes

     0428                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0429                                          $pos += $utf_len;

     0430                                          $tmp_pos = $pos;

     0431                                          continue 2;

     0432                                      }

     0433                                  break;

     0434   

     0435                                  default:

     0436                                      // Five- and six- byte sequences do not need being checked for here anymore

     0437                                      if ($utf_char > UTF8_MAX)

     0438                                      {

     0439                                          // Out of the Unicode range

     0440                                          if ($utf_char[0] < "\xF8")

     0441                                          {

     0442                                              $trailing_bytes = 3;

     0443                                          }

     0444                                          else if ($utf_char[0] < "\xFC")

     0445                                          {

     0446                                              $trailing_bytes = 4;

     0447                                          }

     0448                                          else if ($utf_char[0] > "\xFD")

     0449                                          {

     0450                                              $trailing_bytes = 0;

     0451                                          }

     0452                                          else

     0453                                          {

     0454                                              $trailing_bytes = 5;

     0455                                          }

     0456   

     0457                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0458                                          $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);

     0459                                          $tmp_pos = $pos;

     0460                                          continue 2;

     0461                                      }

     0462                                  break;

     0463                              }

     0464                          }

     0465   

     0466                          // The char is a valid starter, move the cursor and go on

     0467                          $pos += $utf_len;

     0468                          continue;

     0469                      }

     0470                  }

     0471                  else

     0472                  {

     0473                      // A trailing byte came out of nowhere, we will advance the cursor and treat the this byte and all following trailing bytes as if

     0474                      // each of them was a Unicode replacement char

     0475                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);

     0476                      $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

     0477   

     0478                      $pos += $spn;

     0479                      $tmp_pos = $pos;

     0480                      continue;

     0481                  }

     0482   

     0483   

     0484                  // STEP 1: Decompose current char

     0485   

     0486                  // We have found a character that is either:

     0487                  //  - in the NFC_QC/NFKC_QC list

     0488                  //  - a non-starter char that is not canonically ordered

     0489                  //

     0490                  // We are going to capture the shortest UTF sequence that satisfies these two conditions:

     0491                  //

     0492                  //  1 - If the sequence does not start at the begginning of the string, it must begin with a starter,

     0493                  // and that starter must not have the NF[K]C_QC property equal to "MAYBE"

     0494                  //

     0495                  //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be

     0496                  // immediately followed by a starter that is not on the QC list

     0497                  //

     0498                  $utf_seq = array();

     0499                  $last_cc = 0;

     0500                  $lpos = $pos;

     0501                  $pos += $utf_len;

     0502   

     0503                  if (isset($decomp_map[$utf_char]))

     0504                  {

     0505                      $_pos = 0;

     0506                      $_len = strlen($decomp_map[$utf_char]);

     0507   

     0508                      do

     0509                      {

     0510                          $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];

     0511   

     0512                          if (isset($_utf_len))

     0513                          {

     0514                              $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0515                              $_pos += $_utf_len;

     0516                          }

     0517                          else

     0518                          {

     0519                              $utf_seq[] = $decomp_map[$utf_char][$_pos];

     0520                              ++$_pos;

     0521                          }

     0522                      }

     0523                      while ($_pos < $_len);

     0524                  }

     0525                  else

     0526                  {

     0527                      // The char is not decomposable

     0528                      $utf_seq = array($utf_char);

     0529                  }

     0530   

     0531   

     0532                  // STEP 2: Capture the starter

     0533   

     0534                  // Check out the combining class of the first character of the UTF sequence

     0535                  $k = 0;

     0536                  if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)

     0537                  {

     0538                      // Not a starter, inspect previous characters

     0539                      // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.

     0540                      // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,

     0541                      // although it is slower than this method.

     0542                      //

     0543                      // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is

     0544                      // at offset $i) and process them in backward mode until we find a starter.

     0545                      //

     0546                      // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more

     0547                      // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering

     0548                      $starter_found = 0;

     0549                      $j_min = max(1, $i - 7);

     0550   

     0551                      for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)

     0552                      {

     0553                          $utf_char = $buffer[$j & 7];

     0554                          $lpos -= strlen($utf_char);

     0555   

     0556                          if (isset($decomp_map[$utf_char]))

     0557                          {

     0558                              // The char is a composite, decompose for storage

     0559                              $decomp_seq = array();

     0560                              $_pos = 0;

     0561                              $_len = strlen($decomp_map[$utf_char]);

     0562   

     0563                              do

     0564                              {

     0565                                  $c = $decomp_map[$utf_char][$_pos];

     0566                                  $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0567   

     0568                                  if (isset($_utf_len))

     0569                                  {

     0570                                      $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0571                                      $_pos += $_utf_len;

     0572                                  }

     0573                                  else

     0574                                  {

     0575                                      $decomp_seq[] = $c;

     0576                                      ++$_pos;

     0577                                  }

     0578                              }

     0579                              while ($_pos < $_len);

     0580   

     0581                              // Prepend the UTF sequence with our decomposed sequence

     0582                              if (isset($decomp_seq[1]))

     0583                              {

     0584                                  // The char expanded into several chars

     0585                                  $decomp_cnt = sizeof($decomp_seq);

     0586   

     0587                                  foreach ($decomp_seq as $decomp_i => $decomp_char)

     0588                                  {

     0589                                      $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;

     0590                                  }

     0591                                  $k -= $decomp_cnt;

     0592                              }

     0593                              else

     0594                              {

     0595                                  // Decomposed to a single char, easier to prepend

     0596                                  $utf_seq[--$k] = $decomp_seq[0];

     0597                              }

     0598                          }

     0599                          else

     0600                          {

     0601                              $utf_seq[--$k] = $utf_char;

     0602                          }

     0603   

     0604                          if (!isset($utf_combining_class[$utf_seq[$k]]))

     0605                          {

     0606                              // We have found our starter

     0607                              $starter_found = 1;

     0608                              break;

     0609                          }

     0610                      }

     0611   

     0612                      if (!$starter_found && $lpos > $tmp_pos)

     0613                      {

     0614                          // The starter was not found in the buffer, let's rewind some more

     0615                          do

     0616                          {

     0617                              // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.

     0618                              $c = $str[--$lpos];

     0619                              $c_mask = $c & "\xF0";

     0620   

     0621                              if (isset($utf_len_mask[$c_mask]))

     0622                              {

     0623                                  // UTF byte

     0624                                  if ($utf_len = $utf_len_mask[$c_mask])

     0625                                  {

     0626                                      // UTF *leading* byte

     0627                                      $utf_char = substr($str, $lpos, $utf_len);

     0628   

     0629                                      if (isset($decomp_map[$utf_char]))

     0630                                      {

     0631                                          // Decompose the character

     0632                                          $decomp_seq = array();

     0633                                          $_pos = 0;

     0634                                          $_len = strlen($decomp_map[$utf_char]);

     0635   

     0636                                          do

     0637                                          {

     0638                                              $c = $decomp_map[$utf_char][$_pos];

     0639                                              $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0640   

     0641                                              if (isset($_utf_len))

     0642                                              {

     0643                                                  $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0644                                                  $_pos += $_utf_len;

     0645                                              }

     0646                                              else

     0647                                              {

     0648                                                  $decomp_seq[] = $c;

     0649                                                  ++$_pos;

     0650                                              }

     0651                                          }

     0652                                          while ($_pos < $_len);

     0653   

     0654                                          // Prepend the UTF sequence with our decomposed sequence

     0655                                          if (isset($decomp_seq[1]))

     0656                                          {

     0657                                              // The char expanded into several chars

     0658                                              $decomp_cnt = sizeof($decomp_seq);

     0659                                              foreach ($decomp_seq as $decomp_i => $utf_char)

     0660                                              {

     0661                                                  $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;

     0662                                              }

     0663                                              $k -= $decomp_cnt;

     0664                                          }

     0665                                          else

     0666                                          {

     0667                                              // Decomposed to a single char, easier to prepend

     0668                                              $utf_seq[--$k] = $decomp_seq[0];

     0669                                          }

     0670                                      }

     0671                                      else

     0672                                      {

     0673                                          $utf_seq[--$k] = $utf_char;

     0674                                      }

     0675                                  }

     0676                              }

     0677                              else

     0678                              {

     0679                                  // ASCII char

     0680                                  $utf_seq[--$k] = $c;

     0681                              }

     0682                          }

     0683                          while ($lpos > $tmp_pos);

     0684                      }

     0685                  }

     0686   

     0687   

     0688                  // STEP 3: Capture following combining modifiers

     0689   

     0690                  while ($pos < $len)

     0691                  {

     0692                      $c_mask = $str[$pos] & "\xF0";

     0693   

     0694                      if (isset($utf_len_mask[$c_mask]))

     0695                      {

     0696                          if ($utf_len = $utf_len_mask[$c_mask])

     0697                          {

     0698                              $utf_char = substr($str, $pos, $utf_len);

     0699                          }

     0700                          else

     0701                          {

     0702                              // A trailing byte came out of nowhere

     0703                              // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop

     0704                              // as if it was a starter (replacement chars ARE starters) and let the next loop replace it

     0705                              break;

     0706                          }

     0707   

     0708                          if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))

     0709                          {

     0710                              // Combining character, add it to the sequence and move the cursor

     0711                              if (isset($decomp_map[$utf_char]))

     0712                              {

     0713                                  // Decompose the character

     0714                                  $_pos = 0;

     0715                                  $_len = strlen($decomp_map[$utf_char]);

     0716   

     0717                                  do

     0718                                  {

     0719                                      $c = $decomp_map[$utf_char][$_pos];

     0720                                      $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0721   

     0722                                      if (isset($_utf_len))

     0723                                      {

     0724                                          $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0725                                          $_pos += $_utf_len;

     0726                                      }

     0727                                      else

     0728                                      {

     0729                                          $utf_seq[] = $c;

     0730                                          ++$_pos;

     0731                                      }

     0732                                  }

     0733                                  while ($_pos < $_len);

     0734                              }

     0735                              else

     0736                              {

     0737                                  $utf_seq[] = $utf_char;

     0738                              }

     0739   

     0740                              $pos += $utf_len;

     0741                          }

     0742                          else

     0743                          {

     0744                              // Combining class 0 and no QC, break out of the loop

     0745                              // Note: we do not know if that character is valid. If it's not, the next iteration will replace it

     0746                              break;

     0747                          }

     0748                      }

     0749                      else

     0750                      {

     0751                          // ASCII chars are starters

     0752                          break;

     0753                      }

     0754                  }

     0755   

     0756   

     0757                  // STEP 4: Sort and combine

     0758   

     0759                  // Here we sort...

     0760                  $k_max = $k + sizeof($utf_seq);

     0761   

     0762                  if (!$k && $k_max == 1)

     0763                  {

     0764                      // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop

     0765                          // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases

     0766  //                        if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))

     0767  //                        {

     0768                          $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];

     0769                          $tmp_pos = $pos;

     0770  //                        }

     0771   

     0772                      continue;

     0773                  }

     0774   

     0775                  // ...there we combine

     0776                  if (isset($utf_combining_class[$utf_seq[$k]]))

     0777                  {

     0778                      $starter = $nf_seq = '';

     0779                  }

     0780                  else

     0781                  {

     0782                      $starter = $utf_seq[$k++];

     0783                      $nf_seq = '';

     0784                  }

     0785                  $utf_sort = array();

     0786   

     0787                  // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine

     0788                  // at the end of the string without altering it

     0789                  $utf_seq[] = '';

     0790   

     0791                  do

     0792                  {

     0793                      $utf_char = $utf_seq[$k++];

     0794   

     0795                      if (isset($utf_combining_class[$utf_char]))

     0796                      {

     0797                          $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;

     0798                      }

     0799                      else

     0800                      {

     0801                          if (empty($utf_sort))

     0802                          {

     0803                              // No combining characters... check for a composite of the two starters

     0804                              if (isset($utf_canonical_comp[$starter . $utf_char]))

     0805                              {

     0806                                  // Good ol' composite character

     0807                                  $starter = $utf_canonical_comp[$starter . $utf_char];

     0808                              }

     0809                              else if (isset($utf_jamo_type[$utf_char]))

     0810                              {

     0811                                  // Current char is a composable jamo

     0812                                  if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)

     0813                                  {

     0814                                      // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo

     0815                                      if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)

     0816                                      {

     0817                                          // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)

     0818                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];

     0819                                          ++$k;

     0820                                      }

     0821                                      else

     0822                                      {

     0823                                          // L+V jamos, combine to a LV Hangul syllable

     0824                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];

     0825                                      }

     0826   

     0827                                      $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));

     0828                                  }

     0829                                  else

     0830                                  {

     0831                                      // Non-composable jamo, just add it to the sequence

     0832                                      $nf_seq .= $starter;

     0833                                      $starter = $utf_char;

     0834                                  }

     0835                              }

     0836                              else

     0837                              {

     0838                                  // No composite, just add the first starter to the sequence then continue with the other one

     0839                                  $nf_seq .= $starter;

     0840                                  $starter = $utf_char;

     0841                              }

     0842                          }

     0843                          else

     0844                          {

     0845                              ksort($utf_sort);

     0846   

     0847                              // For each class of combining characters

     0848                              foreach ($utf_sort as $cc => $utf_chars)

     0849                              {

     0850                                  $j = 0;

     0851   

     0852                                  do

     0853                                  {

     0854                                      // Look for a composite

     0855                                      if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))

     0856                                      {

     0857                                          // Found a composite, replace the starter

     0858                                          $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];

     0859                                          unset($utf_sort[$cc][$j]);

     0860                                      }

     0861                                      else

     0862                                      {

     0863                                          // No composite, all following characters in that class are blocked

     0864                                          break;

     0865                                      }

     0866                                  }

     0867                                  while (isset($utf_sort[$cc][++$j]));

     0868                              }

     0869   

     0870                              // Add the starter to the normalized sequence, followed by non-starters in canonical order

     0871                              $nf_seq .= $starter;

     0872   

     0873                              foreach ($utf_sort as $utf_chars)

     0874                              {

     0875                                  if (!empty($utf_chars))

     0876                                  {

     0877                                      $nf_seq .= implode('', $utf_chars);

     0878                                  }

     0879                              }

     0880   

     0881                              // Reset the array and go on

     0882                              $utf_sort = array();

     0883                              $starter = $utf_char;

     0884                          }

     0885                      }

     0886                  }

     0887                  while ($k <= $k_max);

     0888   

     0889                  $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;

     0890                  $tmp_pos = $pos;

     0891              }

     0892              else

     0893              {

     0894                  // Only an ASCII char can make the program get here

     0895                  //

     0896                  // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().

     0897                  //

     0898                  // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on

     0899                  // multi-byte text (where the only ASCII chars are spaces and punctuation)

     0900                  if (++$pos != $len)

     0901                  {

     0902                      if ($str[$pos] < "\x80")

     0903                      {

     0904                          $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);

     0905                          $buffer[++$i & 7] = $str[$pos - 1];

     0906                      }

     0907                      else

     0908                      {

     0909                          $buffer[++$i & 7] = $c;

     0910                      }

     0911                  }

     0912              }

     0913          }

     0914          while ($pos < $len);

     0915   

     0916          // Now is time to return the string

     0917          if ($tmp_pos)

     0918          {

     0919              // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version

     0920              if ($tmp_pos == $len)

     0921              {

     0922                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str

     0923                  return $tmp;

     0924              }

     0925              else

     0926              {

     0927                  // The rightmost chunk of $str has not been appended to $tmp yet

     0928                  return $tmp . substr($str, $tmp_pos);

     0929              }

     0930          }

     0931   

     0932          // The string was already in normal form

     0933          return $str;

     0934      }

     0935   

     0936      /**

     0937      * Decompose a UTF string

     0938      *

     0939      * @param    string    $str            UTF string

     0940      * @param    integer    $pos            Position of the first UTF char (in bytes)

     0941      * @param    integer    $len            Length of the string (in bytes)

     0942      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified

     0943      * @return    string                    The string, decomposed and sorted canonically

     0944      *

     0945      * @access    private

     0946      */

     0947      function decompose($str, $pos, $len, &$decomp_map)

     0948      {

     0949          global $utf_combining_class;

     0950   

     0951          // Load some commonly-used tables

     0952          if (!isset($utf_combining_class))

     0953          {

     0954              global $phpbb_root_path, $phpEx;

     0955              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);

     0956          }

     0957   

     0958          // UTF char length array

     0959          $utf_len_mask = array(

     0960              // Leading bytes masks

     0961              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,

     0962              // Trailing bytes masks

     0963              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0

     0964          );

     0965   

     0966          // Some extra checks are triggered on the first byte of a UTF sequence

     0967          $extra_check = array(

     0968              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,

     0969              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,

     0970              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1

     0971          );

     0972   

     0973          // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:

     0974          //   - 2-byte: 110? ???? 10?? ????

     0975          //   - 3-byte: 1110 ???? 10?? ???? 10?? ????

     0976          //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????

     0977          // Note that 5- and 6- byte sequences are automatically discarded

     0978          $utf_validation_mask = array(

     0979              2    => "\xE0\xC0",

     0980              3    => "\xF0\xC0\xC0",

     0981              4    => "\xF8\xC0\xC0\xC0"

     0982          );

     0983   

     0984          $utf_validation_check = array(

     0985              2    => "\xC0\x80",

     0986              3    => "\xE0\x80\x80",

     0987              4    => "\xF0\x80\x80\x80"

     0988          );

     0989   

     0990          $tmp = '';

     0991          $starter_pos = $pos;

     0992          $tmp_pos = $last_cc = $sort = $dump = 0;

     0993          $utf_sort = array();

     0994   

     0995   

     0996          // Main loop

     0997          do

     0998          {

     0999              // STEP 0: Capture the current char

     1000   

     1001              $cur_mask = $str[$pos] & "\xF0";

     1002              if (isset($utf_len_mask[$cur_mask]))

     1003              {

     1004                  if ($utf_len = $utf_len_mask[$cur_mask])

     1005                  {

     1006                      // Multibyte char

     1007                      $utf_char = substr($str, $pos, $utf_len);

     1008                      $pos += $utf_len;

     1009                  }

     1010                  else

     1011                  {

     1012                      // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode

     1013                      // replacement char and we will advance the cursor

     1014                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);

     1015   

     1016                      if ($dump)

     1017                      {

     1018                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1019   

     1020                          // Dump combiners

     1021                          if (!empty($utf_sort))

     1022                          {

     1023                              if ($sort)

     1024                              {

     1025                                  ksort($utf_sort);

     1026                              }

     1027   

     1028                              foreach ($utf_sort as $utf_chars)

     1029                              {

     1030                                  $tmp .= implode('', $utf_chars);

     1031                              }

     1032                          }

     1033   

     1034                          $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);

     1035                          $dump = $sort = 0;

     1036                      }

     1037                      else

     1038                      {

     1039                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

     1040                      }

     1041   

     1042                      $pos += $spn;

     1043                      $tmp_pos = $starter_pos = $pos;

     1044   

     1045                      $utf_sort = array();

     1046                      $last_cc = 0;

     1047   

     1048                      continue;

     1049                  }

     1050   

     1051   

     1052                  // STEP 1: Decide what to do with current char

     1053   

     1054                  // Now, in that order:

     1055                  //  - check if that character is decomposable

     1056                  //  - check if that character is a non-starter

     1057                  //  - check if that character requires extra checks to be performed

     1058                  if (isset($decomp_map[$utf_char]))

     1059                  {

     1060                      // Decompose the char

     1061                      $_pos = 0;

     1062                      $_len = strlen($decomp_map[$utf_char]);

     1063   

     1064                      do

     1065                      {

     1066                          $c = $decomp_map[$utf_char][$_pos];

     1067                          $_utf_len =& $utf_len_mask[$c & "\xF0"];

     1068   

     1069                          if (isset($_utf_len))

     1070                          {

     1071                              $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     1072                              $_pos += $_utf_len;

     1073   

     1074                              if (isset($utf_combining_class[$_utf_char]))

     1075                              {

     1076                                  // The character decomposed to a non-starter, buffer it for sorting

     1077                                  $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;

     1078   

     1079                                  if ($utf_combining_class[$_utf_char] < $last_cc)

     1080                                  {

     1081                                      // Not canonically ordered, will require sorting

     1082                                      $sort = $dump = 1;

     1083                                  }

     1084                                  else

     1085                                  {

     1086                                      $dump = 1;

     1087                                      $last_cc = $utf_combining_class[$_utf_char];

     1088                                  }

     1089                              }

     1090                              else

     1091                              {

     1092                                  // This character decomposition contains a starter, dump the buffer and continue

     1093                                  if ($dump)

     1094                                  {

     1095                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1096   

     1097                                      // Dump combiners

     1098                                      if (!empty($utf_sort))

     1099                                      {

     1100                                          if ($sort)

     1101                                          {

     1102                                              ksort($utf_sort);

     1103                                          }

     1104   

     1105                                          foreach ($utf_sort as $utf_chars)

     1106                                          {

     1107                                              $tmp .= implode('', $utf_chars);

     1108                                          }

     1109                                      }

     1110   

     1111                                      $tmp .= $_utf_char;

     1112                                      $dump = $sort = 0;

     1113                                  }

     1114                                  else

     1115                                  {

     1116                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;

     1117                                  }

     1118   

     1119                                  $tmp_pos = $starter_pos = $pos;

     1120                                  $utf_sort = array();

     1121                                  $last_cc = 0;

     1122                              }

     1123                          }

     1124                          else

     1125                          {

     1126                              // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue

     1127                              ++$_pos;

     1128   

     1129                              if ($dump)

     1130                              {

     1131                                  $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1132   

     1133                                  // Dump combiners

     1134                                  if (!empty($utf_sort))

     1135                                  {

     1136                                      if ($sort)

     1137                                      {

     1138                                          ksort($utf_sort);

     1139                                      }

     1140   

     1141                                      foreach ($utf_sort as $utf_chars)

     1142                                      {

     1143                                          $tmp .= implode('', $utf_chars);

     1144                                      }

     1145                                  }

     1146   

     1147                                  $tmp .= $c;

     1148                                  $dump = $sort = 0;

     1149                              }

     1150                              else

     1151                              {

     1152                                  $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;

     1153                              }

     1154   

     1155                              $tmp_pos = $starter_pos = $pos;

     1156                              $utf_sort = array();

     1157                              $last_cc = 0;

     1158                          }

     1159                      }

     1160                      while ($_pos < $_len);

     1161                  }

     1162                  else if (isset($utf_combining_class[$utf_char]))

     1163                  {

     1164                      // Combining character

     1165                      if ($utf_combining_class[$utf_char] < $last_cc)

     1166                      {

     1167                          // Not in canonical order

     1168                          $sort = $dump = 1;

     1169                      }

     1170                      else

     1171                      {

     1172                          $last_cc = $utf_combining_class[$utf_char];

     1173                      }

     1174   

     1175                      $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;

     1176                  }

     1177                  else

     1178                  {

     1179                      // Non-decomposable starter, check out if it's a Hangul syllable

     1180                      if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)

     1181                      {

     1182                          // Nope, regular UTF char, check that we have the correct number of trailing bytes

     1183                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])

     1184                          {

     1185                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char

     1186                              // has been encoded in a five- or six- byte sequence.

     1187                              // Move the cursor back to its original position then advance it to the position it should really be at

     1188                              $pos -= $utf_len;

     1189                              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1190   

     1191                              if (!empty($utf_sort))

     1192                              {

     1193                                  ksort($utf_sort);

     1194   

     1195                                  foreach ($utf_sort as $utf_chars)

     1196                                  {

     1197                                      $tmp .= implode('', $utf_chars);

     1198                                  }

     1199                                  $utf_sort = array();

     1200                              }

     1201   

     1202                              // Add a replacement char then another replacement char for every trailing byte.

     1203                              //

     1204                              // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this

     1205                              $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);

     1206                              $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);

     1207   

     1208                              $dump = $sort = 0;

     1209   

     1210                              $pos += $spn;

     1211                              $tmp_pos = $pos;

     1212                              continue;

     1213                          }

     1214   

     1215                          if (isset($extra_check[$utf_char[0]]))

     1216                          {

     1217                              switch ($utf_char[0])

     1218                              {

     1219                                  // Note: 0xED is quite common in Korean

     1220                                  case "\xED":

     1221                                      if ($utf_char >= "\xED\xA0\x80")

     1222                                      {

     1223                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)

     1224                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1225   

     1226                                          if (!empty($utf_sort))

     1227                                          {

     1228                                              ksort($utf_sort);

     1229   

     1230                                              foreach ($utf_sort as $utf_chars)

     1231                                              {

     1232                                                  $tmp .= implode('', $utf_chars);

     1233                                              }

     1234                                              $utf_sort = array();

     1235                                          }

     1236   

     1237                                          $tmp .= UTF8_REPLACEMENT;

     1238                                          $dump = $sort = 0;

     1239   

     1240                                          $tmp_pos = $starter_pos = $pos;

     1241                                          continue 2;

     1242                                      }

     1243                                  break;

     1244   

     1245                                  // Note: 0xEF is quite common in Japanese

     1246                                  case "\xEF":

     1247                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")

     1248                                      {

     1249                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)

     1250                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1251   

     1252                                          if (!empty($utf_sort))

     1253                                          {

     1254                                              ksort($utf_sort);

     1255   

     1256                                              foreach ($utf_sort as $utf_chars)

     1257                                              {

     1258                                                  $tmp .= implode('', $utf_chars);

     1259                                              }

     1260                                              $utf_sort = array();

     1261                                          }

     1262   

     1263                                          $tmp .= UTF8_REPLACEMENT;

     1264                                          $dump = $sort = 0;

     1265   

     1266                                          $tmp_pos = $starter_pos = $pos;

     1267                                          continue 2;

     1268                                      }

     1269                                  break;

     1270   

     1271                                  case "\xC0":

     1272                                  case "\xC1":

     1273                                      if ($utf_char <= "\xC1\xBF")

     1274                                      {

     1275                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char

     1276                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1277   

     1278                                          if (!empty($utf_sort))

     1279                                          {

     1280                                              ksort($utf_sort);

     1281   

     1282                                              foreach ($utf_sort as $utf_chars)

     1283                                              {

     1284                                                  $tmp .= implode('', $utf_chars);

     1285                                              }

     1286                                              $utf_sort = array();

     1287                                          }

     1288   

     1289                                          $tmp .= UTF8_REPLACEMENT;

     1290                                          $dump = $sort = 0;

     1291   

     1292                                          $tmp_pos = $starter_pos = $pos;

     1293                                          continue 2;

     1294                                      }

     1295                                  break;

     1296   

     1297                                  case "\xE0":

     1298                                      if ($utf_char <= "\xE0\x9F\xBF")

     1299                                      {

     1300                                          // Unicode char U+0000..U+07FF encoded in 3 bytes

     1301                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1302   

     1303                                          if (!empty($utf_sort))

     1304                                          {

     1305                                              ksort($utf_sort);

     1306   

     1307                                              foreach ($utf_sort as $utf_chars)

     1308                                              {

     1309                                                  $tmp .= implode('', $utf_chars);

     1310                                              }

     1311                                              $utf_sort = array();

     1312                                          }

     1313   

     1314                                          $tmp .= UTF8_REPLACEMENT;

     1315                                          $dump = $sort = 0;

     1316   

     1317                                          $tmp_pos = $starter_pos = $pos;

     1318                                          continue 2;

     1319                                      }

     1320                                  break;

     1321   

     1322                                  case "\xF0":

     1323                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")

     1324                                      {

     1325                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes

     1326                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1327   

     1328                                          if (!empty($utf_sort))

     1329                                          {

     1330                                              ksort($utf_sort);

     1331   

     1332                                              foreach ($utf_sort as $utf_chars)

     1333                                              {

     1334                                                  $tmp .= implode('', $utf_chars);

     1335                                              }

     1336                                              $utf_sort = array();

     1337                                          }

     1338   

     1339                                          $tmp .= UTF8_REPLACEMENT;

     1340                                          $dump = $sort = 0;

     1341   

     1342                                          $tmp_pos = $starter_pos = $pos;

     1343                                          continue 2;

     1344                                      }

     1345                                  break;

     1346   

     1347                                  default:

     1348                                      if ($utf_char > UTF8_MAX)

     1349                                      {

     1350                                          // Out of the Unicode range

     1351                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1352   

     1353                                          if (!empty($utf_sort))

     1354                                          {

     1355                                              ksort($utf_sort);

     1356   

     1357                                              foreach ($utf_sort as $utf_chars)

     1358                                              {

     1359                                                  $tmp .= implode('', $utf_chars);

     1360                                              }

     1361                                              $utf_sort = array();

     1362                                          }

     1363   

     1364                                          $tmp .= UTF8_REPLACEMENT;

     1365                                          $dump = $sort = 0;

     1366   

     1367                                          $tmp_pos = $starter_pos = $pos;

     1368                                          continue 2;

     1369                                      }

     1370                                  break;

     1371                              }

     1372                          }

     1373                      }

     1374                      else

     1375                      {

     1376                          // Hangul syllable

     1377                          $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;

     1378   

     1379                          // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).

     1380                          //

     1381                          // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte

     1382                          if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)

     1383                          {

     1384                              if ($t_index < 25)

     1385                              {

     1386                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";

     1387                                  $utf_char[8] = chr(0xA7 + $t_index);

     1388                              }

     1389                              else

     1390                              {

     1391                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";

     1392                                  $utf_char[8] = chr(0x67 + $t_index);

     1393                              }

     1394                          }

     1395                          else

     1396                          {

     1397                              $utf_char = "\xE1\x84\x00\xE1\x85\x00";

     1398                          }

     1399   

     1400                          $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));

     1401                          $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));

     1402   

     1403                          // Just like other decompositions, the resulting Jamos must be dumped to the tmp string

     1404                          $dump = 1;

     1405                      }

     1406   

     1407                      // Do we need to dump stuff to the tmp string?

     1408                      if ($dump)

     1409                      {

     1410                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1411   

     1412                          // Dump combiners

     1413                          if (!empty($utf_sort))

     1414                          {

     1415                              if ($sort)

     1416                              {

     1417                                  ksort($utf_sort);

     1418                              }

     1419   

     1420                              foreach ($utf_sort as $utf_chars)

     1421                              {

     1422                                  $tmp .= implode('', $utf_chars);

     1423                              }

     1424                          }

     1425   

     1426                          $tmp .= $utf_char;

     1427                          $dump = $sort = 0;

     1428                          $tmp_pos = $pos;

     1429                      }

     1430   

     1431                      $last_cc = 0;

     1432                      $utf_sort = array();

     1433                      $starter_pos = $pos;

     1434                  }

     1435              }

     1436              else

     1437              {

     1438                  // ASCII char, which happens to be a starter (as any other ASCII char)

     1439                  if ($dump)

     1440                  {

     1441                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1442   

     1443                      // Dump combiners

     1444                      if (!empty($utf_sort))

     1445                      {

     1446                          if ($sort)

     1447                          {

     1448                              ksort($utf_sort);

     1449                          }

     1450   

     1451                          foreach ($utf_sort as $utf_chars)

     1452                          {

     1453                              $tmp .= implode('', $utf_chars);

     1454                          }

     1455                      }

     1456   

     1457                      $tmp .= $str[$pos];

     1458                      $dump = $sort = 0;

     1459                      $tmp_pos = ++$pos;

     1460   

     1461                      $pos += strspn($str, UTF8_ASCII_RANGE, $pos);

     1462                  }

     1463                  else

     1464                  {

     1465                      $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);

     1466                  }

     1467   

     1468                  $last_cc = 0;

     1469                  $utf_sort = array();

     1470                  $starter_pos = $pos;

     1471              }

     1472          }

     1473          while ($pos < $len);

     1474   

     1475          // Now is time to return the string

     1476          if ($dump)

     1477          {

     1478              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1479   

     1480              // Dump combiners

     1481              if (!empty($utf_sort))

     1482              {

     1483                  if ($sort)

     1484                  {

     1485                      ksort($utf_sort);

     1486                  }

     1487   

     1488                  foreach ($utf_sort as $utf_chars)

     1489                  {

     1490                      $tmp .= implode('', $utf_chars);

     1491                  }

     1492              }

     1493   

     1494              return $tmp;

     1495          }

     1496          else if ($tmp_pos)

     1497          {

     1498              // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version

     1499              if ($tmp_pos == $len)

     1500              {

     1501                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str

     1502                  return $tmp;

     1503              }

     1504              else

     1505              {

     1506                  // The rightmost chunk of $str has not been appended to $tmp yet

     1507                  return $tmp . substr($str, $tmp_pos);

     1508              }

     1509          }

     1510   

     1511          // The string was already in normal form

     1512          return $str;

     1513      }

     1514  }

     1515   

     1516  ?>

Verzeichnisstruktur phpBB-3.0.0

Zuletzt modifiziert: 09.10.2024, 12:51 - Dateigröße: 41.73 KiB

utf_normalizer.php


     0001  <?php

     0002  /**

     0003  *

     0004  * @package utf

     0005  * @version $Id$

     0006  * @copyright (c) 2005 phpBB Group

     0007  * @license http://opensource.org/licenses/gpl-license.php GNU Public License

     0008  *

     0009  */

     0010   

     0011  /**

     0012  */

     0013  if (!defined('IN_PHPBB'))

     0014  {

               // IN_PHPBB is not defined: stop before anything in this file is declared
     0015      exit;

     0016  }

     0017   

     0018  /**

     0019  * Boundary and sentinel code points, stored as raw UTF-8 byte strings so they can be compared bytewise with characters read from the input

     0020  *

     0021  * Preserved for compatibility

     0022  */

     0023  define('UTF8_REPLACEMENT', "\xEF\xBF\xBD"); // U+FFFD

     0024  define('UTF8_MAX', "\xF4\x8F\xBF\xBF"); // U+10FFFF

     0025  define('UTF8_FFFE', "\xEF\xBF\xBE"); // U+FFFE

     0026  define('UTF8_FFFF', "\xEF\xBF\xBF"); // U+FFFF

     0027  define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80"); // U+D800

     0028  define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF"); // U+DFFF

     0029  define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80"); // U+AC00

     0030  define('UTF8_HANGUL_LAST', "\xED\x9E\xA3"); // U+D7A3

     0031   

     0032  define('UTF8_CJK_FIRST', "\xE4\xB8\x80"); // U+4E00

     0033  define('UTF8_CJK_LAST', "\xE9\xBE\xBB"); // U+9FBB

     0034  define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80"); // U+20000

     0035  define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96"); // U+2A6D6

     0036   

     0037  // Drop any pre-existing globals carrying the lookup-table names, so the isset() checks in the normalizer methods load the tables from the data files (presumably a guard against injected values -- confirm)

     0038  unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

     0039   

     0040  // NFC_QC and NFKC_QC values

     0041  define('UNICODE_QC_MAYBE', 0);

     0042  define('UNICODE_QC_NO', 1);

     0043   

     0044  // ASCII bytes, sorted by frequency; used as the strspn() mask that measures runs of plain ASCII

     0045  define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

     0046   

     0047  // Tail (continuation) bytes of a UTF-8 char; used as the strspn() mask that counts the bytes following an invalid sequence

     0048  define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

     0049   

     0050  // Constants used by the Hangul [de]composition algorithms

           // NCOUNT equals VCOUNT * TCOUNT (21 * 28 = 588) and SCOUNT equals LCOUNT * NCOUNT (19 * 588 = 11172)
     0051  define('UNICODE_HANGUL_SBASE', 0xAC00);

     0052  define('UNICODE_HANGUL_LBASE', 0x1100);

     0053  define('UNICODE_HANGUL_VBASE', 0x1161);

     0054  define('UNICODE_HANGUL_TBASE', 0x11A7);

     0055  define('UNICODE_HANGUL_SCOUNT', 11172);

     0056  define('UNICODE_HANGUL_LCOUNT', 19);

     0057  define('UNICODE_HANGUL_VCOUNT', 21);

     0058  define('UNICODE_HANGUL_TCOUNT', 28);

     0059  define('UNICODE_HANGUL_NCOUNT', 588);

           // Jamo kinds: leading consonant (L), vowel (V), trailing consonant (T) -- presumably the values held in $utf_jamo_type; confirm
     0060  define('UNICODE_JAMO_L', 0);

     0061  define('UNICODE_JAMO_V', 1);

     0062  define('UNICODE_JAMO_T', 2);

     0063   

     0064  /**

     0065  * Unicode normalization routines

     0066  *

     0067  * @package utf

     0068  */

     0069  class utf_normalizer

     0070  {

     0071      /**

     0072      * Validate, cleanup and normalize a string

     0073      *

     0074      * The ultimate convenience function! Clean up invalid UTF-8 sequences,

     0075      * and convert to Normal Form C, canonical composition.

     0076      *

     0077      * @param    string    &$str    The dirty string

     0078      * @return    void              Nothing is returned: the cleaned-up string is written back to $str

     0079      */

     0080      function cleanup(&$str)

     0081      {

     0082          // The string below is the list of all autorized characters, sorted by frequency in latin text

     0083          $pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");

     0084          $len = strlen($str);

     0085   

     0086          if ($pos == $len)

     0087          {

     0088              // ASCII strings with no special chars return immediately

     0089              return;

     0090          }

     0091   

     0092          // Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together

     0093          if (!isset($GLOBALS['utf_nfc_qc']))

     0094          {

     0095              global $phpbb_root_path, $phpEx;

     0096              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);

     0097          }

     0098   

     0099          if (!isset($GLOBALS['utf_canonical_decomp']))

     0100          {

     0101              global $phpbb_root_path, $phpEx;

     0102              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);

     0103          }

     0104   

     0105          // Replace any byte in the range 0x00..0x1F, except for \r, \n and \t

     0106          // We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char

     0107          $str = strtr(

     0108              $str,

     0109              "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",

     0110              "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"

     0111          );

     0112   

     0113          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);

     0114      }

     0115   

     0116      /**

     0117      * Validate and normalize a UTF string to NFC

     0118      *

     0119      * @param    string    &$str    Unchecked UTF string

     0120      * @return    void              Nothing is returned: the validated, NFC-normalized string is written back to $str

     0121      */

     0122      function nfc(&$str)

     0123      {

     0124          $pos = strspn($str, UTF8_ASCII_RANGE);

     0125          $len = strlen($str);

     0126   

     0127          if ($pos == $len)

     0128          {

     0129              // ASCII strings return immediately

     0130              return;

     0131          }

     0132   

     0133          if (!isset($GLOBALS['utf_nfc_qc']))

     0134          {

     0135              global $phpbb_root_path, $phpEx;

     0136              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);

     0137          }

     0138   

     0139          if (!isset($GLOBALS['utf_canonical_decomp']))

     0140          {

     0141              global $phpbb_root_path, $phpEx;

     0142              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);

     0143          }

     0144   

     0145          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);

     0146      }

     0147   

     0148      /**

     0149      * Validate and normalize a UTF string to NFKC

     0150      *

     0151      * @param    string    &$str    Unchecked UTF string

     0152      * @return    void              Nothing is returned: the validated, NFKC-normalized string is written back to $str

     0153      */

     0154      function nfkc(&$str)

     0155      {

     0156          $pos = strspn($str, UTF8_ASCII_RANGE);

     0157          $len = strlen($str);

     0158   

     0159          if ($pos == $len)

     0160          {

     0161              // ASCII strings return immediately

     0162              return;

     0163          }

     0164   

     0165          if (!isset($GLOBALS['utf_nfkc_qc']))

     0166          {

     0167              global $phpbb_root_path, $phpEx;

     0168              include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);

     0169          }

     0170   

     0171          if (!isset($GLOBALS['utf_compatibility_decomp']))

     0172          {

     0173              global $phpbb_root_path, $phpEx;

     0174              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);

     0175          }

     0176   

     0177          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

     0178      }

     0179   

     0180      /**

     0181      * Validate and normalize a UTF string to NFD

     0182      *

     0183      * @param    string    &$str    Unchecked UTF string

     0184      * @return    void              Nothing is returned: the validated, NFD-normalized string is written back to $str

     0185      */

     0186      function nfd(&$str)

     0187      {

     0188          $pos = strspn($str, UTF8_ASCII_RANGE);

     0189          $len = strlen($str);

     0190   

     0191          if ($pos == $len)

     0192          {

     0193              // ASCII strings return immediately

     0194              return;

     0195          }

     0196   

     0197          if (!isset($GLOBALS['utf_canonical_decomp']))

     0198          {

     0199              global $phpbb_root_path, $phpEx;

     0200              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);

     0201          }

     0202   

     0203          $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);

     0204      }

     0205   

     0206      /**

     0207      * Validate and normalize a UTF string to NFKD

     0208      *

     0209      * @param    string    &$str    Unchecked UTF string

     0210      * @return    void              Nothing is returned: the validated, NFKD-normalized string is written back to $str

     0211      */

     0212      function nfkd(&$str)

     0213      {

     0214          $pos = strspn($str, UTF8_ASCII_RANGE);

     0215          $len = strlen($str);

     0216   

     0217          if ($pos == $len)

     0218          {

     0219              // ASCII strings return immediately

     0220              return;

     0221          }

     0222   

     0223          if (!isset($GLOBALS['utf_compatibility_decomp']))

     0224          {

     0225              global $phpbb_root_path, $phpEx;

     0226              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);

     0227          }

     0228   

     0229          $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);

     0230      }

     0231   

     0232   

     0233      /**

     0234      * Recompose a UTF string

     0235      *

     0236      * @param    string    $str            Unchecked UTF string

     0237      * @param    integer    $pos            Position of the first UTF char (in bytes)

     0238      * @param    integer    $len            Length of the string (in bytes)

     0239      * @param    array    &$qc            Quick-check array, passed by reference but never modified

     0240      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified

     0241      * @return    string                    The string, validated and recomposed

     0242      *

     0243      * @access    private

     0244      */

     0245      function recompose($str, $pos, $len, &$qc, &$decomp_map)

     0246      {

     0247          global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;

     0248   

     0249          // Load some commonly-used tables

     0250          if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))

     0251          {

     0252              global $phpbb_root_path, $phpEx;

     0253              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);

     0254          }

     0255   

     0256          // Load the canonical composition table

     0257          if (!isset($utf_canonical_comp))

     0258          {

     0259              global $phpbb_root_path, $phpEx;

     0260              include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);

     0261          }

     0262   

     0263          // Buffer the last ASCII char before the UTF-8 stuff if applicable

     0264          $tmp = '';

     0265          $i = $tmp_pos = $last_cc = 0;

     0266   

     0267          $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();

     0268   

     0269          // UTF char length array

     0270          // This array is used to determine the length of a UTF character.

     0271          // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos

     0272          // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.

     0273          // Otherwise, if $utf_len_mask[$c] is greater than 0, we have the leading byte of a multibyte character

     0274          // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.

     0275          $utf_len_mask = array(

     0276              // Leading bytes masks

     0277              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,

     0278              // Trailing bytes masks

     0279              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0

     0280          );

     0281   

     0282          $extra_check = array(

     0283              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,

     0284              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,

     0285              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1

     0286          );

     0287   

     0288          $utf_validation_mask = array(

     0289              2    => "\xE0\xC0",

     0290              3    => "\xF0\xC0\xC0",

     0291              4    => "\xF8\xC0\xC0\xC0"

     0292          );

     0293   

     0294          $utf_validation_check = array(

     0295              2    => "\xC0\x80",

     0296              3    => "\xE0\x80\x80",

     0297              4    => "\xF0\x80\x80\x80"

     0298          );

     0299   

     0300          // Main loop

     0301          do

     0302          {

     0303              // STEP 0: Capture the current char and buffer it

     0304              $c = $str[$pos];

     0305              $c_mask = $c & "\xF0";

     0306   

     0307              if (isset($utf_len_mask[$c_mask]))

     0308              {

     0309                  // Byte at $pos is either a leading byte or a misplaced trailing byte

     0310                  if ($utf_len = $utf_len_mask[$c_mask])

     0311                  {

     0312                      // Capture the char

     0313                      $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);

     0314   

     0315                      // Let's find out if a thorough check is needed

     0316                      if (isset($qc[$utf_char]))

     0317                      {

     0318                          // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block

     0319                      }

     0320                      else if (isset($utf_combining_class[$utf_char]))

     0321                      {

     0322                          if ($utf_combining_class[$utf_char] < $last_cc)

     0323                          {

     0324                              // A combining character that is NOT canonically ordered

     0325                          }

     0326                          else

     0327                          {

     0328                              // A combining character that IS canonically ordered, skip to the next char

     0329                              $last_cc = $utf_combining_class[$utf_char];

     0330   

     0331                              $pos += $utf_len;

     0332                              continue;

     0333                          }

     0334                      }

     0335                      else

     0336                      {

     0337                          // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.

     0338                          // It can be a singleton, a canonical composite, a replacement char or even an ill-formed bunch of bytes. Let's find out

     0339                          $last_cc = 0;

     0340   

     0341                          // Check that we have the correct number of trailing bytes

     0342                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])

     0343                          {

     0344                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char

     0345                              // has been encoded in a five- or six- byte sequence

     0346                              if ($utf_char[0] >= "\xF8")

     0347                              {

     0348                                  if ($utf_char[0] < "\xFC")

     0349                                  {

     0350                                      $trailing_bytes = 4;

     0351                                  }

     0352                                  else if ($utf_char[0] > "\xFD")

     0353                                  {

     0354                                      $trailing_bytes = 0;

     0355                                  }

     0356                                  else

     0357                                  {

     0358                                      $trailing_bytes = 5;

     0359                                  }

     0360                              }

     0361                              else

     0362                              {

     0363                                  $trailing_bytes = $utf_len - 1;

     0364                              }

     0365   

     0366                              $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0367                              $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);

     0368                              $tmp_pos = $pos;

     0369   

     0370                              continue;

     0371                          }

     0372   

     0373                          if (isset($extra_check[$c]))

     0374                          {

     0375                              switch ($c)

     0376                              {

     0377                                  // Note: 0xED is quite common in Korean

     0378                                  case "\xED":

     0379                                      if ($utf_char >= "\xED\xA0\x80")

     0380                                      {

     0381                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)

     0382                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0383                                          $pos += $utf_len;

     0384                                          $tmp_pos = $pos;

     0385                                          continue 2;

     0386                                      }

     0387                                  break;

     0388   

     0389                                  // Note: 0xEF is quite common in Japanese

     0390                                  case "\xEF":

     0391                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")

     0392                                      {

     0393                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)

     0394                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0395                                          $pos += $utf_len;

     0396                                          $tmp_pos = $pos;

     0397                                          continue 2;

     0398                                      }

     0399                                  break;

     0400   

     0401                                  case "\xC0":

     0402                                  case "\xC1":

     0403                                      if ($utf_char <= "\xC1\xBF")

     0404                                      {

     0405                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char

     0406                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0407                                          $pos += $utf_len;

     0408                                          $tmp_pos = $pos;

     0409                                          continue 2;

     0410                                      }

     0411                                  break;

     0412   

     0413                                  case "\xE0":

     0414                                      if ($utf_char <= "\xE0\x9F\xBF")

     0415                                      {

     0416                                          // Unicode char U+0000..U+07FF encoded in 3 bytes

     0417                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0418                                          $pos += $utf_len;

     0419                                          $tmp_pos = $pos;

     0420                                          continue 2;

     0421                                      }

     0422                                  break;

     0423   

     0424                                  case "\xF0":

     0425                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")

     0426                                      {

     0427                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes

     0428                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0429                                          $pos += $utf_len;

     0430                                          $tmp_pos = $pos;

     0431                                          continue 2;

     0432                                      }

     0433                                  break;

     0434   

     0435                                  default:

     0436                                      // Five- and six- byte sequences do not need being checked for here anymore

     0437                                      if ($utf_char > UTF8_MAX)

     0438                                      {

     0439                                          // Out of the Unicode range

     0440                                          if ($utf_char[0] < "\xF8")

     0441                                          {

     0442                                              $trailing_bytes = 3;

     0443                                          }

     0444                                          else if ($utf_char[0] < "\xFC")

     0445                                          {

     0446                                              $trailing_bytes = 4;

     0447                                          }

     0448                                          else if ($utf_char[0] > "\xFD")

     0449                                          {

     0450                                              $trailing_bytes = 0;

     0451                                          }

     0452                                          else

     0453                                          {

     0454                                              $trailing_bytes = 5;

     0455                                          }

     0456   

     0457                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0458                                          $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);

     0459                                          $tmp_pos = $pos;

     0460                                          continue 2;

     0461                                      }

     0462                                  break;

     0463                              }

     0464                          }

     0465   

     0466                          // The char is a valid starter, move the cursor and go on

     0467                          $pos += $utf_len;

     0468                          continue;

     0469                      }

     0470                  }

     0471                  else

     0472                  {

     0473                      // A trailing byte came out of nowhere, we will advance the cursor and treat this byte and all following trailing bytes as if

     0474                      // each of them was a Unicode replacement char

     0475                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);

     0476                      $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

     0477   

     0478                      $pos += $spn;

     0479                      $tmp_pos = $pos;

     0480                      continue;

     0481                  }

     0482   

     0483   

     0484                  // STEP 1: Decompose current char

     0485   

     0486                  // We have found a character that is either:

     0487                  //  - in the NFC_QC/NFKC_QC list

     0488                  //  - a non-starter char that is not canonically ordered

     0489                  //

     0490                  // We are going to capture the shortest UTF sequence that satisfies these two conditions:

     0491                  //

     0492                  //  1 - If the sequence does not start at the beginning of the string, it must begin with a starter,

     0493                  // and that starter must not have the NF[K]C_QC property equal to "MAYBE"

     0494                  //

     0495                  //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be

     0496                  // immediately followed by a starter that is not on the QC list

     0497                  //

     0498                  $utf_seq = array();

     0499                  $last_cc = 0;

     0500                  $lpos = $pos;

     0501                  $pos += $utf_len;

     0502   

     0503                  if (isset($decomp_map[$utf_char]))

     0504                  {

     0505                      $_pos = 0;

     0506                      $_len = strlen($decomp_map[$utf_char]);

     0507   

     0508                      do

     0509                      {

     0510                          $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];

     0511   

     0512                          if (isset($_utf_len))

     0513                          {

     0514                              $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0515                              $_pos += $_utf_len;

     0516                          }

     0517                          else

     0518                          {

     0519                              $utf_seq[] = $decomp_map[$utf_char][$_pos];

     0520                              ++$_pos;

     0521                          }

     0522                      }

     0523                      while ($_pos < $_len);

     0524                  }

     0525                  else

     0526                  {

     0527                      // The char is not decomposable

     0528                      $utf_seq = array($utf_char);

     0529                  }

     0530   

     0531   

     0532                  // STEP 2: Capture the starter

     0533   

     0534                  // Check out the combining class of the first character of the UTF sequence

     0535                  $k = 0;

     0536                  if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)

     0537                  {

     0538                      // Not a starter, inspect previous characters

     0539                      // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.

     0540                      // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,

     0541                      // although it is slower than this method.

     0542                      //

     0543                      // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is

     0544                      // at offset $i) and process them in backward mode until we find a starter.

     0545                      //

     0546                      // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more

     0547                      // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering

     0548                      $starter_found = 0;

     0549                      $j_min = max(1, $i - 7);

     0550   

     0551                      for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)

     0552                      {

     0553                          $utf_char = $buffer[$j & 7];

     0554                          $lpos -= strlen($utf_char);

     0555   

     0556                          if (isset($decomp_map[$utf_char]))

     0557                          {

     0558                              // The char is a composite, decompose for storage

     0559                              $decomp_seq = array();

     0560                              $_pos = 0;

     0561                              $_len = strlen($decomp_map[$utf_char]);

     0562   

     0563                              do

     0564                              {

     0565                                  $c = $decomp_map[$utf_char][$_pos];

     0566                                  $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0567   

     0568                                  if (isset($_utf_len))

     0569                                  {

     0570                                      $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0571                                      $_pos += $_utf_len;

     0572                                  }

     0573                                  else

     0574                                  {

     0575                                      $decomp_seq[] = $c;

     0576                                      ++$_pos;

     0577                                  }

     0578                              }

     0579                              while ($_pos < $_len);

     0580   

     0581                              // Prepend the UTF sequence with our decomposed sequence

     0582                              if (isset($decomp_seq[1]))

     0583                              {

     0584                                  // The char expanded into several chars

     0585                                  $decomp_cnt = sizeof($decomp_seq);

     0586   

     0587                                  foreach ($decomp_seq as $decomp_i => $decomp_char)

     0588                                  {

     0589                                      $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;

     0590                                  }

     0591                                  $k -= $decomp_cnt;

     0592                              }

     0593                              else

     0594                              {

     0595                                  // Decomposed to a single char, easier to prepend

     0596                                  $utf_seq[--$k] = $decomp_seq[0];

     0597                              }

     0598                          }

     0599                          else

     0600                          {

     0601                              $utf_seq[--$k] = $utf_char;

     0602                          }

     0603   

     0604                          if (!isset($utf_combining_class[$utf_seq[$k]]))

     0605                          {

     0606                              // We have found our starter

     0607                              $starter_found = 1;

     0608                              break;

     0609                          }

     0610                      }

     0611   

     0612                      if (!$starter_found && $lpos > $tmp_pos)

     0613                      {

     0614                          // The starter was not found in the buffer, let's rewind some more

     0615                          do

     0616                          {

     0617                              // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.

     0618                              $c = $str[--$lpos];

     0619                              $c_mask = $c & "\xF0";

     0620   

     0621                              if (isset($utf_len_mask[$c_mask]))

     0622                              {

     0623                                  // UTF byte

     0624                                  if ($utf_len = $utf_len_mask[$c_mask])

     0625                                  {

     0626                                      // UTF *leading* byte

     0627                                      $utf_char = substr($str, $lpos, $utf_len);

     0628   

     0629                                      if (isset($decomp_map[$utf_char]))

     0630                                      {

     0631                                          // Decompose the character

     0632                                          $decomp_seq = array();

     0633                                          $_pos = 0;

     0634                                          $_len = strlen($decomp_map[$utf_char]);

     0635   

     0636                                          do

     0637                                          {

     0638                                              $c = $decomp_map[$utf_char][$_pos];

     0639                                              $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0640   

     0641                                              if (isset($_utf_len))

     0642                                              {

     0643                                                  $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0644                                                  $_pos += $_utf_len;

     0645                                              }

     0646                                              else

     0647                                              {

     0648                                                  $decomp_seq[] = $c;

     0649                                                  ++$_pos;

     0650                                              }

     0651                                          }

     0652                                          while ($_pos < $_len);

     0653   

     0654                                          // Prepend the UTF sequence with our decomposed sequence

     0655                                          if (isset($decomp_seq[1]))

     0656                                          {

     0657                                              // The char expanded into several chars

     0658                                              $decomp_cnt = sizeof($decomp_seq);

     0659                                              foreach ($decomp_seq as $decomp_i => $utf_char)

     0660                                              {

     0661                                                  $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;

     0662                                              }

     0663                                              $k -= $decomp_cnt;

     0664                                          }

     0665                                          else

     0666                                          {

     0667                                              // Decomposed to a single char, easier to prepend

     0668                                              $utf_seq[--$k] = $decomp_seq[0];

     0669                                          }

     0670                                      }

     0671                                      else

     0672                                      {

     0673                                          $utf_seq[--$k] = $utf_char;

     0674                                      }

     0675                                  }

     0676                              }

     0677                              else

     0678                              {

     0679                                  // ASCII char

     0680                                  $utf_seq[--$k] = $c;

     0681                              }

     0682                          }

     0683                          while ($lpos > $tmp_pos);

     0684                      }

     0685                  }

     0686   

     0687   

     0688                  // STEP 3: Capture following combining modifiers

     0689   

     0690                  while ($pos < $len)

     0691                  {

     0692                      $c_mask = $str[$pos] & "\xF0";

     0693   

     0694                      if (isset($utf_len_mask[$c_mask]))

     0695                      {

     0696                          if ($utf_len = $utf_len_mask[$c_mask])

     0697                          {

     0698                              $utf_char = substr($str, $pos, $utf_len);

     0699                          }

     0700                          else

     0701                          {

     0702                              // A trailing byte came out of nowhere

     0703                              // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop

     0704                              // as if it was a starter (replacement chars ARE starters) and let the next loop replace it

     0705                              break;

     0706                          }

     0707   

     0708                          if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))

     0709                          {

     0710                              // Combining character, add it to the sequence and move the cursor

     0711                              if (isset($decomp_map[$utf_char]))

     0712                              {

     0713                                  // Decompose the character

     0714                                  $_pos = 0;

     0715                                  $_len = strlen($decomp_map[$utf_char]);

     0716   

     0717                                  do

     0718                                  {

     0719                                      $c = $decomp_map[$utf_char][$_pos];

     0720                                      $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0721   

     0722                                      if (isset($_utf_len))

     0723                                      {

     0724                                          $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0725                                          $_pos += $_utf_len;

     0726                                      }

     0727                                      else

     0728                                      {

     0729                                          $utf_seq[] = $c;

     0730                                          ++$_pos;

     0731                                      }

     0732                                  }

     0733                                  while ($_pos < $_len);

     0734                              }

     0735                              else

     0736                              {

     0737                                  $utf_seq[] = $utf_char;

     0738                              }

     0739   

     0740                              $pos += $utf_len;

     0741                          }

     0742                          else

     0743                          {

     0744                              // Combining class 0 and no QC, break out of the loop

     0745                              // Note: we do not know if that character is valid. If it's not, the next iteration will replace it

     0746                              break;

     0747                          }

     0748                      }

     0749                      else

     0750                      {

     0751                          // ASCII chars are starters

     0752                          break;

     0753                      }

     0754                  }

     0755   

     0756   

     0757                  // STEP 4: Sort and combine

     0758   

     0759                  // Here we sort...

     0760                  $k_max = $k + sizeof($utf_seq);

     0761   

     0762                  if (!$k && $k_max == 1)

     0763                  {

     0764                      // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop

     0765                          // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases

     0766  //                        if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))

     0767  //                        {

     0768                          $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];

     0769                          $tmp_pos = $pos;

     0770  //                        }

     0771   

     0772                      continue;

     0773                  }

     0774   

     0775                  // ...there we combine

     0776                  if (isset($utf_combining_class[$utf_seq[$k]]))

     0777                  {

     0778                      $starter = $nf_seq = '';

     0779                  }

     0780                  else

     0781                  {

     0782                      $starter = $utf_seq[$k++];

     0783                      $nf_seq = '';

     0784                  }

     0785                  $utf_sort = array();

     0786   

     0787                  // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine

     0788                  // at the end of the string without altering it

     0789                  $utf_seq[] = '';

     0790   

     0791                  do

     0792                  {

     0793                      $utf_char = $utf_seq[$k++];

     0794   

     0795                      if (isset($utf_combining_class[$utf_char]))

     0796                      {

     0797                          $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;

     0798                      }

     0799                      else

     0800                      {

     0801                          if (empty($utf_sort))

     0802                          {

     0803                              // No combining characters... check for a composite of the two starters

     0804                              if (isset($utf_canonical_comp[$starter . $utf_char]))

     0805                              {

     0806                                  // Good ol' composite character

     0807                                  $starter = $utf_canonical_comp[$starter . $utf_char];

     0808                              }

     0809                              else if (isset($utf_jamo_type[$utf_char]))

     0810                              {

     0811                                  // Current char is a composable jamo

     0812                                  if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)

     0813                                  {

     0814                                      // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo

     0815                                      if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)

     0816                                      {

     0817                                          // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)

     0818                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];

     0819                                          ++$k;

     0820                                      }

     0821                                      else

     0822                                      {

     0823                                          // L+V jamos, combine to a LV Hangul syllable

     0824                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];

     0825                                      }

     0826   

     0827                                      $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));

     0828                                  }

     0829                                  else

     0830                                  {

     0831                                      // Non-composable jamo, just add it to the sequence

     0832                                      $nf_seq .= $starter;

     0833                                      $starter = $utf_char;

     0834                                  }

     0835                              }

     0836                              else

     0837                              {

     0838                                  // No composite, just add the first starter to the sequence then continue with the other one

     0839                                  $nf_seq .= $starter;

     0840                                  $starter = $utf_char;

     0841                              }

     0842                          }

     0843                          else

     0844                          {

     0845                              ksort($utf_sort);

     0846   

     0847                              // For each class of combining characters

     0848                              foreach ($utf_sort as $cc => $utf_chars)

     0849                              {

     0850                                  $j = 0;

     0851   

     0852                                  do

     0853                                  {

     0854                                      // Look for a composite

     0855                                      if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))

     0856                                      {

     0857                                          // Found a composite, replace the starter

     0858                                          $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];

     0859                                          unset($utf_sort[$cc][$j]);

     0860                                      }

     0861                                      else

     0862                                      {

     0863                                          // No composite, all following characters in that class are blocked

     0864                                          break;

     0865                                      }

     0866                                  }

     0867                                  while (isset($utf_sort[$cc][++$j]));

     0868                              }

     0869   

     0870                              // Add the starter to the normalized sequence, followed by non-starters in canonical order

     0871                              $nf_seq .= $starter;

     0872   

     0873                              foreach ($utf_sort as $utf_chars)

     0874                              {

     0875                                  if (!empty($utf_chars))

     0876                                  {

     0877                                      $nf_seq .= implode('', $utf_chars);

     0878                                  }

     0879                              }

     0880   

     0881                              // Reset the array and go on

     0882                              $utf_sort = array();

     0883                              $starter = $utf_char;

     0884                          }

     0885                      }

     0886                  }

     0887                  while ($k <= $k_max);

     0888   

     0889                  $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;

     0890                  $tmp_pos = $pos;

     0891              }

     0892              else

     0893              {

     0894                  // Only an ASCII char can make the program get here

     0895                  //

     0896                  // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().

     0897                  //

     0898                  // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on

     0899                  // multi-byte text (where the only ASCII chars are spaces and punctuation)

     0900                  if (++$pos != $len)

     0901                  {

     0902                      if ($str[$pos] < "\x80")

     0903                      {

     0904                          $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);

     0905                          $buffer[++$i & 7] = $str[$pos - 1];

     0906                      }

     0907                      else

     0908                      {

     0909                          $buffer[++$i & 7] = $c;

     0910                      }

     0911                  }

     0912              }

     0913          }

     0914          while ($pos < $len);

     0915   

     0916          // Now is time to return the string

     0917          if ($tmp_pos)

     0918          {

     0919              // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version

     0920              if ($tmp_pos == $len)

     0921              {

     0922                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str

     0923                  return $tmp;

     0924              }

     0925              else

     0926              {

     0927                  // The rightmost chunk of $str has not been appended to $tmp yet

     0928                  return $tmp . substr($str, $tmp_pos);

     0929              }

     0930          }

     0931   

     0932          // The string was already in normal form

     0933          return $str;

     0934      }

     0935   

     0936      /**

     0937      * Decompose a UTF string

     0938      *

     0939      * @param    string    $str            UTF string

     0940      * @param    integer    $pos            Position of the first UTF char (in bytes)

     0941      * @param    integer    $len            Length of the string (in bytes)

     0942      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified

     0943      * @return    string                    The string, decomposed and sorted canonically

     0944      *

     0945      * @access    private

     0946      */

     0947      function decompose($str, $pos, $len, &$decomp_map)

     0948      {

     0949          global $utf_combining_class;

     0950   

     0951          // Load some commonly-used tables

     0952          if (!isset($utf_combining_class))

     0953          {

     0954              global $phpbb_root_path, $phpEx;

     0955              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);

     0956          }

     0957   

     0958          // UTF char length array

     0959          $utf_len_mask = array(

     0960              // Leading bytes masks

     0961              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,

     0962              // Trailing bytes masks

     0963              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0

     0964          );

     0965   

     0966          // Some extra checks are triggered on the first byte of a UTF sequence

     0967          $extra_check = array(

     0968              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,

     0969              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,

     0970              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1

     0971          );

     0972   

     0973          // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:

     0974          //   - 2-byte: 110? ???? 10?? ????

     0975          //   - 3-byte: 1110 ???? 10?? ???? 10?? ????

     0976          //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????

     0977          // Note that 5- and 6- byte sequences are automatically discarded

     0978          $utf_validation_mask = array(

     0979              2    => "\xE0\xC0",

     0980              3    => "\xF0\xC0\xC0",

     0981              4    => "\xF8\xC0\xC0\xC0"

     0982          );

     0983   

     0984          $utf_validation_check = array(

     0985              2    => "\xC0\x80",

     0986              3    => "\xE0\x80\x80",

     0987              4    => "\xF0\x80\x80\x80"

     0988          );

     0989   

     0990          $tmp = '';

     0991          $starter_pos = $pos;

     0992          $tmp_pos = $last_cc = $sort = $dump = 0;

     0993          $utf_sort = array();

     0994   

     0995   

     0996          // Main loop

     0997          do

     0998          {

     0999              // STEP 0: Capture the current char

     1000   

     1001              $cur_mask = $str[$pos] & "\xF0";

     1002              if (isset($utf_len_mask[$cur_mask]))

     1003              {

     1004                  if ($utf_len = $utf_len_mask[$cur_mask])

     1005                  {

     1006                      // Multibyte char

     1007                      $utf_char = substr($str, $pos, $utf_len);

     1008                      $pos += $utf_len;

     1009                  }

     1010                  else

     1011                  {

     1012                      // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode

     1013                      // replacement char and we will advance the cursor

     1014                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);

     1015   

     1016                      if ($dump)

     1017                      {

     1018                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1019   

     1020                          // Dump combiners

     1021                          if (!empty($utf_sort))

     1022                          {

     1023                              if ($sort)

     1024                              {

     1025                                  ksort($utf_sort);

     1026                              }

     1027   

     1028                              foreach ($utf_sort as $utf_chars)

     1029                              {

     1030                                  $tmp .= implode('', $utf_chars);

     1031                              }

     1032                          }

     1033   

     1034                          $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);

     1035                          $dump = $sort = 0;

     1036                      }

     1037                      else

     1038                      {

     1039                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

     1040                      }

     1041   

     1042                      $pos += $spn;

     1043                      $tmp_pos = $starter_pos = $pos;

     1044   

     1045                      $utf_sort = array();

     1046                      $last_cc = 0;

     1047   

     1048                      continue;

     1049                  }

     1050   

     1051   

     1052                  // STEP 1: Decide what to do with current char

     1053   

     1054                  // Now, in that order:

     1055                  //  - check if that character is decomposable

     1056                  //  - check if that character is a non-starter

     1057                  //  - check if that character requires extra checks to be performed

     1058                  if (isset($decomp_map[$utf_char]))

     1059                  {

     1060                      // Decompose the char

     1061                      $_pos = 0;

     1062                      $_len = strlen($decomp_map[$utf_char]);

     1063   

     1064                      do

     1065                      {

     1066                          $c = $decomp_map[$utf_char][$_pos];

     1067                          $_utf_len =& $utf_len_mask[$c & "\xF0"];

     1068   

     1069                          if (isset($_utf_len))

     1070                          {

     1071                              $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     1072                              $_pos += $_utf_len;

     1073   

     1074                              if (isset($utf_combining_class[$_utf_char]))

     1075                              {

     1076                                  // The character decomposed to a non-starter, buffer it for sorting

     1077                                  $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;

     1078   

     1079                                  if ($utf_combining_class[$_utf_char] < $last_cc)

     1080                                  {

     1081                                      // Not canonically ordered, will require sorting

     1082                                      $sort = $dump = 1;

     1083                                  }

     1084                                  else

     1085                                  {

     1086                                      $dump = 1;

     1087                                      $last_cc = $utf_combining_class[$_utf_char];

     1088                                  }

     1089                              }

     1090                              else

     1091                              {

     1092                                  // This character decomposition contains a starter, dump the buffer and continue

     1093                                  if ($dump)

     1094                                  {

     1095                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1096   

     1097                                      // Dump combiners

     1098                                      if (!empty($utf_sort))

     1099                                      {

     1100                                          if ($sort)

     1101                                          {

     1102                                              ksort($utf_sort);

     1103                                          }

     1104   

     1105                                          foreach ($utf_sort as $utf_chars)

     1106                                          {

     1107                                              $tmp .= implode('', $utf_chars);

     1108                                          }

     1109                                      }

     1110   

     1111                                      $tmp .= $_utf_char;

     1112                                      $dump = $sort = 0;

     1113                                  }

     1114                                  else

     1115                                  {

     1116                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;

     1117                                  }

     1118   

     1119                                  $tmp_pos = $starter_pos = $pos;

     1120                                  $utf_sort = array();

     1121                                  $last_cc = 0;

     1122                              }

     1123                          }

     1124                          else

     1125                          {

     1126                              // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue

     1127                              ++$_pos;

     1128   

     1129                              if ($dump)

     1130                              {

     1131                                  $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1132   

     1133                                  // Dump combiners

     1134                                  if (!empty($utf_sort))

     1135                                  {

     1136                                      if ($sort)

     1137                                      {

     1138                                          ksort($utf_sort);

     1139                                      }

     1140   

     1141                                      foreach ($utf_sort as $utf_chars)

     1142                                      {

     1143                                          $tmp .= implode('', $utf_chars);

     1144                                      }

     1145                                  }

     1146   

     1147                                  $tmp .= $c;

     1148                                  $dump = $sort = 0;

     1149                              }

     1150                              else

     1151                              {

     1152                                  $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;

     1153                              }

     1154   

     1155                              $tmp_pos = $starter_pos = $pos;

     1156                              $utf_sort = array();

     1157                              $last_cc = 0;

     1158                          }

     1159                      }

     1160                      while ($_pos < $_len);

     1161                  }

     1162                  else if (isset($utf_combining_class[$utf_char]))

     1163                  {

     1164                      // Combining character

     1165                      if ($utf_combining_class[$utf_char] < $last_cc)

     1166                      {

     1167                          // Not in canonical order

     1168                          $sort = $dump = 1;

     1169                      }

     1170                      else

     1171                      {

     1172                          $last_cc = $utf_combining_class[$utf_char];

     1173                      }

     1174   

     1175                      $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;

     1176                  }

     1177                  else

     1178                  {

     1179                      // Non-decomposable starter, check out if it's a Hangul syllable

     1180                      if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)

     1181                      {

     1182                          // Nope, regular UTF char, check that we have the correct number of trailing bytes

     1183                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])

     1184                          {

     1185                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char

     1186                              // has been encoded in a five- or six- byte sequence.

     1187                              // Move the cursor back to its original position then advance it to the position it should really be at

     1188                              $pos -= $utf_len;

     1189                              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1190   

     1191                              if (!empty($utf_sort))

     1192                              {

     1193                                  ksort($utf_sort);

     1194   

     1195                                  foreach ($utf_sort as $utf_chars)

     1196                                  {

     1197                                      $tmp .= implode('', $utf_chars);

     1198                                  }

     1199                                  $utf_sort = array();

     1200                              }

     1201   

     1202                              // Add a replacement char then another replacement char for every trailing byte.

     1203                              //

     1204                              // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this

     1205                              $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);

     1206                              $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);

     1207   

     1208                              $dump = $sort = 0;

     1209   

     1210                              $pos += $spn;

     1211                              $tmp_pos = $pos;

     1212                              continue;

     1213                          }

     1214   

     1215                          if (isset($extra_check[$utf_char[0]]))

     1216                          {

     1217                              switch ($utf_char[0])

     1218                              {

     1219                                  // Note: 0xED is quite common in Korean

     1220                                  case "\xED":

     1221                                      if ($utf_char >= "\xED\xA0\x80")

     1222                                      {

     1223                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)

     1224                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1225   

     1226                                          if (!empty($utf_sort))

     1227                                          {

     1228                                              ksort($utf_sort);

     1229   

     1230                                              foreach ($utf_sort as $utf_chars)

     1231                                              {

     1232                                                  $tmp .= implode('', $utf_chars);

     1233                                              }

     1234                                              $utf_sort = array();

     1235                                          }

     1236   

     1237                                          $tmp .= UTF8_REPLACEMENT;

     1238                                          $dump = $sort = 0;

     1239   

     1240                                          $tmp_pos = $starter_pos = $pos;

     1241                                          continue 2;

     1242                                      }

     1243                                  break;

     1244   

     1245                                  // Note: 0xEF is quite common in Japanese

     1246                                  case "\xEF":

     1247                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")

     1248                                      {

     1249                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)

     1250                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1251   

     1252                                          if (!empty($utf_sort))

     1253                                          {

     1254                                              ksort($utf_sort);

     1255   

     1256                                              foreach ($utf_sort as $utf_chars)

     1257                                              {

     1258                                                  $tmp .= implode('', $utf_chars);

     1259                                              }

     1260                                              $utf_sort = array();

     1261                                          }

     1262   

     1263                                          $tmp .= UTF8_REPLACEMENT;

     1264                                          $dump = $sort = 0;

     1265   

     1266                                          $tmp_pos = $starter_pos = $pos;

     1267                                          continue 2;

     1268                                      }

     1269                                  break;

     1270   

     1271                                  case "\xC0":

     1272                                  case "\xC1":

     1273                                      if ($utf_char <= "\xC1\xBF")

     1274                                      {

     1275                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char

     1276                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1277   

     1278                                          if (!empty($utf_sort))

     1279                                          {

     1280                                              ksort($utf_sort);

     1281   

     1282                                              foreach ($utf_sort as $utf_chars)

     1283                                              {

     1284                                                  $tmp .= implode('', $utf_chars);

     1285                                              }

     1286                                              $utf_sort = array();

     1287                                          }

     1288   

     1289                                          $tmp .= UTF8_REPLACEMENT;

     1290                                          $dump = $sort = 0;

     1291   

     1292                                          $tmp_pos = $starter_pos = $pos;

     1293                                          continue 2;

     1294                                      }

     1295                                  break;

     1296   

     1297                                  case "\xE0":

     1298                                      if ($utf_char <= "\xE0\x9F\xBF")

     1299                                      {

     1300                                          // Unicode char U+0000..U+07FF encoded in 3 bytes

     1301                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1302   

     1303                                          if (!empty($utf_sort))

     1304                                          {

     1305                                              ksort($utf_sort);

     1306   

     1307                                              foreach ($utf_sort as $utf_chars)

     1308                                              {

     1309                                                  $tmp .= implode('', $utf_chars);

     1310                                              }

     1311                                              $utf_sort = array();

     1312                                          }

     1313   

     1314                                          $tmp .= UTF8_REPLACEMENT;

     1315                                          $dump = $sort = 0;

     1316   

     1317                                          $tmp_pos = $starter_pos = $pos;

     1318                                          continue 2;

     1319                                      }

     1320                                  break;

     1321   

     1322                                  case "\xF0":

     1323                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")

     1324                                      {

     1325                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes

     1326                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1327   

     1328                                          if (!empty($utf_sort))

     1329                                          {

     1330                                              ksort($utf_sort);

     1331   

     1332                                              foreach ($utf_sort as $utf_chars)

     1333                                              {

     1334                                                  $tmp .= implode('', $utf_chars);

     1335                                              }

     1336                                              $utf_sort = array();

     1337                                          }

     1338   

     1339                                          $tmp .= UTF8_REPLACEMENT;

     1340                                          $dump = $sort = 0;

     1341   

     1342                                          $tmp_pos = $starter_pos = $pos;

     1343                                          continue 2;

     1344                                      }

     1345                                  break;

     1346   

     1347                                  default:

     1348                                      if ($utf_char > UTF8_MAX)

     1349                                      {

     1350                                          // Out of the Unicode range

     1351                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1352   

     1353                                          if (!empty($utf_sort))

     1354                                          {

     1355                                              ksort($utf_sort);

     1356   

     1357                                              foreach ($utf_sort as $utf_chars)

     1358                                              {

     1359                                                  $tmp .= implode('', $utf_chars);

     1360                                              }

     1361                                              $utf_sort = array();

     1362                                          }

     1363   

     1364                                          $tmp .= UTF8_REPLACEMENT;

     1365                                          $dump = $sort = 0;

     1366   

     1367                                          $tmp_pos = $starter_pos = $pos;

     1368                                          continue 2;

     1369                                      }

     1370                                  break;

     1371                              }

     1372                          }

     1373                      }

     1374                      else

     1375                      {

     1376                          // Hangul syllable

     1377                          $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;

     1378   

     1379                          // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).

     1380                          //

     1381                          // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte

     1382                          if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)

     1383                          {

     1384                              if ($t_index < 25)

     1385                              {

     1386                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";

     1387                                  $utf_char[8] = chr(0xA7 + $t_index);

     1388                              }

     1389                              else

     1390                              {

     1391                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";

     1392                                  $utf_char[8] = chr(0x67 + $t_index);

     1393                              }

     1394                          }

     1395                          else

     1396                          {

     1397                              $utf_char = "\xE1\x84\x00\xE1\x85\x00";

     1398                          }

     1399   

     1400                          $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));

     1401                          $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));

     1402   

     1403                          // Just like other decompositions, the resulting Jamos must be dumped to the tmp string

     1404                          $dump = 1;

     1405                      }

     1406   

     1407                      // Do we need to dump stuff to the tmp string?

     1408                      if ($dump)

     1409                      {

     1410                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1411   

     1412                          // Dump combiners

     1413                          if (!empty($utf_sort))

     1414                          {

     1415                              if ($sort)

     1416                              {

     1417                                  ksort($utf_sort);

     1418                              }

     1419   

     1420                              foreach ($utf_sort as $utf_chars)

     1421                              {

     1422                                  $tmp .= implode('', $utf_chars);

     1423                              }

     1424                          }

     1425   

     1426                          $tmp .= $utf_char;

     1427                          $dump = $sort = 0;

     1428                          $tmp_pos = $pos;

     1429                      }

     1430   

     1431                      $last_cc = 0;

     1432                      $utf_sort = array();

     1433                      $starter_pos = $pos;

     1434                  }

     1435              }

     1436              else

     1437              {

     1438                  // ASCII char, which happens to be a starter (as any other ASCII char)

     1439                  if ($dump)

     1440                  {

     1441                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1442   

     1443                      // Dump combiners

     1444                      if (!empty($utf_sort))

     1445                      {

     1446                          if ($sort)

     1447                          {

     1448                              ksort($utf_sort);

     1449                          }

     1450   

     1451                          foreach ($utf_sort as $utf_chars)

     1452                          {

     1453                              $tmp .= implode('', $utf_chars);

     1454                          }

     1455                      }

     1456   

     1457                      $tmp .= $str[$pos];

     1458                      $dump = $sort = 0;

     1459                      $tmp_pos = ++$pos;

     1460   

     1461                      $pos += strspn($str, UTF8_ASCII_RANGE, $pos);

     1462                  }

     1463                  else

     1464                  {

     1465                      $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);

     1466                  }

     1467   

     1468                  $last_cc = 0;

     1469                  $utf_sort = array();

     1470                  $starter_pos = $pos;

     1471              }

     1472          }

     1473          while ($pos < $len);

     1474   

     1475          // Now it is time to return the string

     1476          if ($dump)

     1477          {

     1478              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1479   

     1480              // Dump combiners

     1481              if (!empty($utf_sort))

     1482              {

     1483                  if ($sort)

     1484                  {

     1485                      ksort($utf_sort);

     1486                  }

     1487   

     1488                  foreach ($utf_sort as $utf_chars)

     1489                  {

     1490                      $tmp .= implode('', $utf_chars);

     1491                  }

     1492              }

     1493   

     1494              return $tmp;

     1495          }

     1496          else if ($tmp_pos)

     1497          {

     1498              // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version

     1499              if ($tmp_pos == $len)

     1500              {

     1501                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str

     1502                  return $tmp;

     1503              }

     1504              else

     1505              {

     1506                  // The rightmost chunk of $str has not been appended to $tmp yet

     1507                  return $tmp . substr($str, $tmp_pos);

     1508              }

     1509          }

     1510   

     1511          // The string was already in normal form

     1512          return $str;

     1513      }

     1514  }

     1515   

     1516  ?>