wimCMS • Blick zurück von phpBB 1.0.0 bis heute

Verzeichnisstruktur phpBB-3.1.0

Veröffentlicht: 27.10.2014

So funktioniert es

Verzeichnis-Info phpBB-1.0.0 / auth.php	Auf das letzte Element klicken. Dies geht jeweils einen Schritt zurück
admin	Auf das Icon klicken, dies öffnet das Verzeichnis. Nochmal klicken schließt das Verzeichnis. Auf den Verzeichnisnamen klicken, dies zeigt nur das Verzeichnis mit Inhalt an
(Beispiel Datei-Icons)	Auf das Icon klicken, um den Quellcode anzuzeigen

utf_normalizer.php

Zuletzt modifiziert: 09.10.2024, 12:52 - Dateigröße: 41.87 KiB


     0001  <?php

     0002  /**

     0003  *

     0004  * This file is part of the phpBB Forum Software package.

     0005  *

     0006  * @copyright (c) phpBB Limited <https://www.phpbb.com>

     0007  * @license GNU General Public License, version 2 (GPL-2.0)

     0008  *

     0009  * For full copyright and license information, please see

     0010  * the docs/CREDITS.txt file.

     0011  *

     0012  */

     0013   

     0014  /**

     0015  */

     0016  if (!defined('IN_PHPBB'))

     0017  {

     0018      exit;

     0019  }

     0020   

     0021  /**

     0022  * Some Unicode characters encoded in UTF-8

     0023  *

     0024  * Preserved for compatibility

     0025  */

     0026  define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");

     0027  define('UTF8_MAX', "\xF4\x8F\xBF\xBF");

     0028  define('UTF8_FFFE', "\xEF\xBF\xBE");

     0029  define('UTF8_FFFF', "\xEF\xBF\xBF");

     0030  define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");

     0031  define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");

     0032  define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");

     0033  define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");

     0034   

     0035  define('UTF8_CJK_FIRST', "\xE4\xB8\x80");

     0036  define('UTF8_CJK_LAST', "\xE9\xBE\xBB");

     0037  define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");

     0038  define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");

     0039   

     0040  // Unset global variables

     0041  unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

     0042   

     0043  // NFC_QC and NFKC_QC values

     0044  define('UNICODE_QC_MAYBE', 0);

     0045  define('UNICODE_QC_NO', 1);

     0046   

     0047  // Contains all the ASCII characters appearing in UTF-8, sorted by frequency

     0048  define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

     0049   

     0050  // Contains all the tail bytes that can appear in the composition of a UTF-8 char

     0051  define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

     0052   

     0053  // Constants used by the Hangul [de]composition algorithms

     0054  define('UNICODE_HANGUL_SBASE', 0xAC00);

     0055  define('UNICODE_HANGUL_LBASE', 0x1100);

     0056  define('UNICODE_HANGUL_VBASE', 0x1161);

     0057  define('UNICODE_HANGUL_TBASE', 0x11A7);

     0058  define('UNICODE_HANGUL_SCOUNT', 11172);

     0059  define('UNICODE_HANGUL_LCOUNT', 19);

     0060  define('UNICODE_HANGUL_VCOUNT', 21);

     0061  define('UNICODE_HANGUL_TCOUNT', 28);

     0062  define('UNICODE_HANGUL_NCOUNT', 588);

     0063  define('UNICODE_JAMO_L', 0);

     0064  define('UNICODE_JAMO_V', 1);

     0065  define('UNICODE_JAMO_T', 2);

     0066   

     0067  /**

     0068  * Unicode normalization routines

     0069  */

     0070  class utf_normalizer

     0071  {

     0072      /**

     0073      * Validate, cleanup and normalize a string

     0074      *

     0075      * The ultimate convenience function! Clean up invalid UTF-8 sequences,

     0076      * and convert to Normal Form C, canonical composition.

     0077      *

     0078      * @param    string    &$str    The dirty string

     0079      * @return    string            The same string, all shiny and cleaned-up

     0080      */

     0081      static function cleanup(&$str)

     0082      {

     0083          // The string below is the list of all autorized characters, sorted by frequency in latin text

     0084          $pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");

     0085          $len = strlen($str);

     0086   

     0087          if ($pos == $len)

     0088          {

     0089              // ASCII strings with no special chars return immediately

     0090              return;

     0091          }

     0092   

     0093          // Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together

     0094          if (!isset($GLOBALS['utf_nfc_qc']))

     0095          {

     0096              global $phpbb_root_path, $phpEx;

     0097              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);

     0098          }

     0099   

     0100          if (!isset($GLOBALS['utf_canonical_decomp']))

     0101          {

     0102              global $phpbb_root_path, $phpEx;

     0103              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);

     0104          }

     0105   

     0106          // Replace any byte in the range 0x00..0x1F, except for \r, \n and \t

     0107          // We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char

     0108          $str = strtr(

     0109              $str,

     0110              "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",

     0111              "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"

     0112          );

     0113   

     0114          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);

     0115      }

     0116   

     0117      /**

     0118      * Validate and normalize a UTF string to NFC

     0119      *

     0120      * @param    string    &$str    Unchecked UTF string

     0121      * @return    string            The string, validated and in normal form

     0122      */

     0123      static function nfc(&$str)

     0124      {

     0125          $pos = strspn($str, UTF8_ASCII_RANGE);

     0126          $len = strlen($str);

     0127   

     0128          if ($pos == $len)

     0129          {

     0130              // ASCII strings return immediately

     0131              return;

     0132          }

     0133   

     0134          if (!isset($GLOBALS['utf_nfc_qc']))

     0135          {

     0136              global $phpbb_root_path, $phpEx;

     0137              include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);

     0138          }

     0139   

     0140          if (!isset($GLOBALS['utf_canonical_decomp']))

     0141          {

     0142              global $phpbb_root_path, $phpEx;

     0143              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);

     0144          }

     0145   

     0146          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);

     0147      }

     0148   

     0149      /**

     0150      * Validate and normalize a UTF string to NFKC

     0151      *

     0152      * @param    string    &$str    Unchecked UTF string

     0153      * @return    string            The string, validated and in normal form

     0154      */

     0155      static function nfkc(&$str)

     0156      {

     0157          $pos = strspn($str, UTF8_ASCII_RANGE);

     0158          $len = strlen($str);

     0159   

     0160          if ($pos == $len)

     0161          {

     0162              // ASCII strings return immediately

     0163              return;

     0164          }

     0165   

     0166          if (!isset($GLOBALS['utf_nfkc_qc']))

     0167          {

     0168              global $phpbb_root_path, $phpEx;

     0169              include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);

     0170          }

     0171   

     0172          if (!isset($GLOBALS['utf_compatibility_decomp']))

     0173          {

     0174              global $phpbb_root_path, $phpEx;

     0175              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);

     0176          }

     0177   

     0178          $str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

     0179      }

     0180   

     0181      /**

     0182      * Validate and normalize a UTF string to NFD

     0183      *

     0184      * @param    string    &$str    Unchecked UTF string

     0185      * @return    string            The string, validated and in normal form

     0186      */

     0187      static function nfd(&$str)

     0188      {

     0189          $pos = strspn($str, UTF8_ASCII_RANGE);

     0190          $len = strlen($str);

     0191   

     0192          if ($pos == $len)

     0193          {

     0194              // ASCII strings return immediately

     0195              return;

     0196          }

     0197   

     0198          if (!isset($GLOBALS['utf_canonical_decomp']))

     0199          {

     0200              global $phpbb_root_path, $phpEx;

     0201              include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);

     0202          }

     0203   

     0204          $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);

     0205      }

     0206   

     0207      /**

     0208      * Validate and normalize a UTF string to NFKD

     0209      *

     0210      * @param    string    &$str    Unchecked UTF string

     0211      * @return    string            The string, validated and in normal form

     0212      */

     0213      static function nfkd(&$str)

     0214      {

     0215          $pos = strspn($str, UTF8_ASCII_RANGE);

     0216          $len = strlen($str);

     0217   

     0218          if ($pos == $len)

     0219          {

     0220              // ASCII strings return immediately

     0221              return;

     0222          }

     0223   

     0224          if (!isset($GLOBALS['utf_compatibility_decomp']))

     0225          {

     0226              global $phpbb_root_path, $phpEx;

     0227              include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);

     0228          }

     0229   

     0230          $str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);

     0231      }

     0232   

     0233   

     0234      /**

     0235      * Recompose a UTF string

     0236      *

     0237      * @param    string    $str            Unchecked UTF string

     0238      * @param    integer    $pos            Position of the first UTF char (in bytes)

     0239      * @param    integer    $len            Length of the string (in bytes)

     0240      * @param    array    &$qc            Quick-check array, passed by reference but never modified

     0241      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified

     0242      * @return    string                    The string, validated and recomposed

     0243      *

     0244      * @access    private

     0245      */

     0246      static function recompose($str, $pos, $len, &$qc, &$decomp_map)

     0247      {

     0248          global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;

     0249   

     0250          // Load some commonly-used tables

     0251          if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))

     0252          {

     0253              global $phpbb_root_path, $phpEx;

     0254              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);

     0255          }

     0256   

     0257          // Load the canonical composition table

     0258          if (!isset($utf_canonical_comp))

     0259          {

     0260              global $phpbb_root_path, $phpEx;

     0261              include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);

     0262          }

     0263   

     0264          // Buffer the last ASCII char before the UTF-8 stuff if applicable

     0265          $tmp = '';

     0266          $i = $tmp_pos = $last_cc = 0;

     0267   

     0268          $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();

     0269   

     0270          // UTF char length array

     0271          // This array is used to determine the length of a UTF character.

     0272          // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos

     0273          // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.

     0274          // Otherwise, if $utf_len_mask[$c] is greater than 0, we have a the leading byte of a multibyte character

     0275          // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.

     0276          $utf_len_mask = array(

     0277              // Leading bytes masks

     0278              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,

     0279              // Trailing bytes masks

     0280              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0

     0281          );

     0282   

     0283          $extra_check = array(

     0284              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,

     0285              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,

     0286              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1

     0287          );

     0288   

     0289          $utf_validation_mask = array(

     0290              2    => "\xE0\xC0",

     0291              3    => "\xF0\xC0\xC0",

     0292              4    => "\xF8\xC0\xC0\xC0"

     0293          );

     0294   

     0295          $utf_validation_check = array(

     0296              2    => "\xC0\x80",

     0297              3    => "\xE0\x80\x80",

     0298              4    => "\xF0\x80\x80\x80"

     0299          );

     0300   

     0301          // Main loop

     0302          do

     0303          {

     0304              // STEP 0: Capture the current char and buffer it

     0305              $c = $str[$pos];

     0306              $c_mask = $c & "\xF0";

     0307   

     0308              if (isset($utf_len_mask[$c_mask]))

     0309              {

     0310                  // Byte at $pos is either a leading byte or a missplaced trailing byte

     0311                  if ($utf_len = $utf_len_mask[$c_mask])

     0312                  {

     0313                      // Capture the char

     0314                      $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);

     0315   

     0316                      // Let's find out if a thorough check is needed

     0317                      if (isset($qc[$utf_char]))

     0318                      {

     0319                          // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block

     0320                      }

     0321                      else if (isset($utf_combining_class[$utf_char]))

     0322                      {

     0323                          if ($utf_combining_class[$utf_char] < $last_cc)

     0324                          {

     0325                              // A combining character that is NOT canonically ordered

     0326                          }

     0327                          else

     0328                          {

     0329                              // A combining character that IS canonically ordered, skip to the next char

     0330                              $last_cc = $utf_combining_class[$utf_char];

     0331   

     0332                              $pos += $utf_len;

     0333                              continue;

     0334                          }

     0335                      }

     0336                      else

     0337                      {

     0338                          // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.

     0339                          // It can be a singleton, a canonical composite, a replacement char or an even an ill-formed bunch of bytes. Let's find out

     0340                          $last_cc = 0;

     0341   

     0342                          // Check that we have the correct number of trailing bytes

     0343                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])

     0344                          {

     0345                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char

     0346                              // has been encoded in a five- or six- byte sequence

     0347                              if ($utf_char[0] >= "\xF8")

     0348                              {

     0349                                  if ($utf_char[0] < "\xFC")

     0350                                  {

     0351                                      $trailing_bytes = 4;

     0352                                  }

     0353                                  else if ($utf_char[0] > "\xFD")

     0354                                  {

     0355                                      $trailing_bytes = 0;

     0356                                  }

     0357                                  else

     0358                                  {

     0359                                      $trailing_bytes = 5;

     0360                                  }

     0361                              }

     0362                              else

     0363                              {

     0364                                  $trailing_bytes = $utf_len - 1;

     0365                              }

     0366   

     0367                              $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0368                              $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);

     0369                              $tmp_pos = $pos;

     0370   

     0371                              continue;

     0372                          }

     0373   

     0374                          if (isset($extra_check[$c]))

     0375                          {

     0376                              switch ($c)

     0377                              {

     0378                                  // Note: 0xED is quite common in Korean

     0379                                  case "\xED":

     0380                                      if ($utf_char >= "\xED\xA0\x80")

     0381                                      {

     0382                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)

     0383                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0384                                          $pos += $utf_len;

     0385                                          $tmp_pos = $pos;

     0386                                          continue 2;

     0387                                      }

     0388                                  break;

     0389   

     0390                                  // Note: 0xEF is quite common in Japanese

     0391                                  case "\xEF":

     0392                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")

     0393                                      {

     0394                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)

     0395                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0396                                          $pos += $utf_len;

     0397                                          $tmp_pos = $pos;

     0398                                          continue 2;

     0399                                      }

     0400                                  break;

     0401   

     0402                                  case "\xC0":

     0403                                  case "\xC1":

     0404                                      if ($utf_char <= "\xC1\xBF")

     0405                                      {

     0406                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char

     0407                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0408                                          $pos += $utf_len;

     0409                                          $tmp_pos = $pos;

     0410                                          continue 2;

     0411                                      }

     0412                                  break;

     0413   

     0414                                  case "\xE0":

     0415                                      if ($utf_char <= "\xE0\x9F\xBF")

     0416                                      {

     0417                                          // Unicode char U+0000..U+07FF encoded in 3 bytes

     0418                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0419                                          $pos += $utf_len;

     0420                                          $tmp_pos = $pos;

     0421                                          continue 2;

     0422                                      }

     0423                                  break;

     0424   

     0425                                  case "\xF0":

     0426                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")

     0427                                      {

     0428                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes

     0429                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0430                                          $pos += $utf_len;

     0431                                          $tmp_pos = $pos;

     0432                                          continue 2;

     0433                                      }

     0434                                  break;

     0435   

     0436                                  default:

     0437                                      // Five- and six- byte sequences do not need being checked for here anymore

     0438                                      if ($utf_char > UTF8_MAX)

     0439                                      {

     0440                                          // Out of the Unicode range

     0441                                          if ($utf_char[0] < "\xF8")

     0442                                          {

     0443                                              $trailing_bytes = 3;

     0444                                          }

     0445                                          else if ($utf_char[0] < "\xFC")

     0446                                          {

     0447                                              $trailing_bytes = 4;

     0448                                          }

     0449                                          else if ($utf_char[0] > "\xFD")

     0450                                          {

     0451                                              $trailing_bytes = 0;

     0452                                          }

     0453                                          else

     0454                                          {

     0455                                              $trailing_bytes = 5;

     0456                                          }

     0457   

     0458                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0459                                          $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);

     0460                                          $tmp_pos = $pos;

     0461                                          continue 2;

     0462                                      }

     0463                                  break;

     0464                              }

     0465                          }

     0466   

     0467                          // The char is a valid starter, move the cursor and go on

     0468                          $pos += $utf_len;

     0469                          continue;

     0470                      }

     0471                  }

     0472                  else

     0473                  {

     0474                      // A trailing byte came out of nowhere, we will advance the cursor and treat the this byte and all following trailing bytes as if

     0475                      // each of them was a Unicode replacement char

     0476                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);

     0477                      $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

     0478   

     0479                      $pos += $spn;

     0480                      $tmp_pos = $pos;

     0481                      continue;

     0482                  }

     0483   

     0484                  // STEP 1: Decompose current char

     0485   

     0486                  // We have found a character that is either:

     0487                  //  - in the NFC_QC/NFKC_QC list

     0488                  //  - a non-starter char that is not canonically ordered

     0489                  //

     0490                  // We are going to capture the shortest UTF sequence that satisfies these two conditions:

     0491                  //

     0492                  //  1 - If the sequence does not start at the begginning of the string, it must begin with a starter,

     0493                  // and that starter must not have the NF[K]C_QC property equal to "MAYBE"

     0494                  //

     0495                  //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be

     0496                  // immediately followed by a starter that is not on the QC list

     0497                  //

     0498                  $utf_seq = array();

     0499                  $last_cc = 0;

     0500                  $lpos = $pos;

     0501                  $pos += $utf_len;

     0502   

     0503                  if (isset($decomp_map[$utf_char]))

     0504                  {

     0505                      $_pos = 0;

     0506                      $_len = strlen($decomp_map[$utf_char]);

     0507   

     0508                      do

     0509                      {

     0510                          $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];

     0511   

     0512                          if (isset($_utf_len))

     0513                          {

     0514                              $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0515                              $_pos += $_utf_len;

     0516                          }

     0517                          else

     0518                          {

     0519                              $utf_seq[] = $decomp_map[$utf_char][$_pos];

     0520                              ++$_pos;

     0521                          }

     0522                      }

     0523                      while ($_pos < $_len);

     0524                  }

     0525                  else

     0526                  {

     0527                      // The char is not decomposable

     0528                      $utf_seq = array($utf_char);

     0529                  }

     0530   

     0531                  // STEP 2: Capture the starter

     0532   

     0533                  // Check out the combining class of the first character of the UTF sequence

     0534                  $k = 0;

     0535                  if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)

     0536                  {

     0537                      // Not a starter, inspect previous characters

     0538                      // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.

     0539                      // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,

     0540                      // although it is slower than this method.

     0541                      //

     0542                      // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is

     0543                      // at offset $i) and process them in backward mode until we find a starter.

     0544                      //

     0545                      // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more

     0546                      // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering

     0547                      $starter_found = 0;

     0548                      $j_min = max(1, $i - 7);

     0549   

     0550                      for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)

     0551                      {

     0552                          $utf_char = $buffer[$j & 7];

     0553                          $lpos -= strlen($utf_char);

     0554   

     0555                          if (isset($decomp_map[$utf_char]))

     0556                          {

     0557                              // The char is a composite, decompose for storage

     0558                              $decomp_seq = array();

     0559                              $_pos = 0;

     0560                              $_len = strlen($decomp_map[$utf_char]);

     0561   

     0562                              do

     0563                              {

     0564                                  $c = $decomp_map[$utf_char][$_pos];

     0565                                  $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0566   

     0567                                  if (isset($_utf_len))

     0568                                  {

     0569                                      $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0570                                      $_pos += $_utf_len;

     0571                                  }

     0572                                  else

     0573                                  {

     0574                                      $decomp_seq[] = $c;

     0575                                      ++$_pos;

     0576                                  }

     0577                              }

     0578                              while ($_pos < $_len);

     0579   

     0580                              // Prepend the UTF sequence with our decomposed sequence

     0581                              if (isset($decomp_seq[1]))

     0582                              {

     0583                                  // The char expanded into several chars

     0584                                  $decomp_cnt = sizeof($decomp_seq);

     0585   

     0586                                  foreach ($decomp_seq as $decomp_i => $decomp_char)

     0587                                  {

     0588                                      $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;

     0589                                  }

     0590                                  $k -= $decomp_cnt;

     0591                              }

     0592                              else

     0593                              {

     0594                                  // Decomposed to a single char, easier to prepend

     0595                                  $utf_seq[--$k] = $decomp_seq[0];

     0596                              }

     0597                          }

     0598                          else

     0599                          {

     0600                              $utf_seq[--$k] = $utf_char;

     0601                          }

     0602   

     0603                          if (!isset($utf_combining_class[$utf_seq[$k]]))

     0604                          {

     0605                              // We have found our starter

     0606                              $starter_found = 1;

     0607                              break;

     0608                          }

     0609                      }

     0610   

     0611                      if (!$starter_found && $lpos > $tmp_pos)

     0612                      {

     0613                          // The starter was not found in the buffer, let's rewind some more

     0614                          do

     0615                          {

     0616                              // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.

     0617                              $c = $str[--$lpos];

     0618                              $c_mask = $c & "\xF0";

     0619   

     0620                              if (isset($utf_len_mask[$c_mask]))

     0621                              {

     0622                                  // UTF byte

     0623                                  if ($utf_len = $utf_len_mask[$c_mask])

     0624                                  {

     0625                                      // UTF *leading* byte

     0626                                      $utf_char = substr($str, $lpos, $utf_len);

     0627   

     0628                                      if (isset($decomp_map[$utf_char]))

     0629                                      {

     0630                                          // Decompose the character

     0631                                          $decomp_seq = array();

     0632                                          $_pos = 0;

     0633                                          $_len = strlen($decomp_map[$utf_char]);

     0634   

     0635                                          do

     0636                                          {

     0637                                              $c = $decomp_map[$utf_char][$_pos];

     0638                                              $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0639   

     0640                                              if (isset($_utf_len))

     0641                                              {

     0642                                                  $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0643                                                  $_pos += $_utf_len;

     0644                                              }

     0645                                              else

     0646                                              {

     0647                                                  $decomp_seq[] = $c;

     0648                                                  ++$_pos;

     0649                                              }

     0650                                          }

     0651                                          while ($_pos < $_len);

     0652   

     0653                                          // Prepend the UTF sequence with our decomposed sequence

     0654                                          if (isset($decomp_seq[1]))

     0655                                          {

     0656                                              // The char expanded into several chars

     0657                                              $decomp_cnt = sizeof($decomp_seq);

     0658                                              foreach ($decomp_seq as $decomp_i => $utf_char)

     0659                                              {

     0660                                                  $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;

     0661                                              }

     0662                                              $k -= $decomp_cnt;

     0663                                          }

     0664                                          else

     0665                                          {

     0666                                              // Decomposed to a single char, easier to prepend

     0667                                              $utf_seq[--$k] = $decomp_seq[0];

     0668                                          }

     0669                                      }

     0670                                      else

     0671                                      {

     0672                                          $utf_seq[--$k] = $utf_char;

     0673                                      }

     0674                                  }

     0675                              }

     0676                              else

     0677                              {

     0678                                  // ASCII char

     0679                                  $utf_seq[--$k] = $c;

     0680                              }

     0681                          }

     0682                          while ($lpos > $tmp_pos);

     0683                      }

     0684                  }

     0685   

     0686                  // STEP 3: Capture following combining modifiers

     0687   

     0688                  while ($pos < $len)

     0689                  {

     0690                      $c_mask = $str[$pos] & "\xF0";

     0691   

     0692                      if (isset($utf_len_mask[$c_mask]))

     0693                      {

     0694                          if ($utf_len = $utf_len_mask[$c_mask])

     0695                          {

     0696                              $utf_char = substr($str, $pos, $utf_len);

     0697                          }

     0698                          else

     0699                          {

     0700                              // A trailing byte came out of nowhere

     0701                              // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop

     0702                              // as if it was a starter (replacement chars ARE starters) and let the next loop replace it

     0703                              break;

     0704                          }

     0705   

     0706                          if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))

     0707                          {

     0708                              // Combining character, add it to the sequence and move the cursor

     0709                              if (isset($decomp_map[$utf_char]))

     0710                              {

     0711                                  // Decompose the character

     0712                                  $_pos = 0;

     0713                                  $_len = strlen($decomp_map[$utf_char]);

     0714   

     0715                                  do

     0716                                  {

     0717                                      $c = $decomp_map[$utf_char][$_pos];

     0718                                      $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0719   

     0720                                      if (isset($_utf_len))

     0721                                      {

     0722                                          $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0723                                          $_pos += $_utf_len;

     0724                                      }

     0725                                      else

     0726                                      {

     0727                                          $utf_seq[] = $c;

     0728                                          ++$_pos;

     0729                                      }

     0730                                  }

     0731                                  while ($_pos < $_len);

     0732                              }

     0733                              else

     0734                              {

     0735                                  $utf_seq[] = $utf_char;

     0736                              }

     0737   

     0738                              $pos += $utf_len;

     0739                          }

     0740                          else

     0741                          {

     0742                              // Combining class 0 and no QC, break out of the loop

     0743                              // Note: we do not know if that character is valid. If it's not, the next iteration will replace it

     0744                              break;

     0745                          }

     0746                      }

     0747                      else

     0748                      {

     0749                          // ASCII chars are starters

     0750                          break;

     0751                      }

     0752                  }

     0753   

     0754                  // STEP 4: Sort and combine

     0755   

     0756                  // Here we sort...

     0757                  $k_max = $k + sizeof($utf_seq);

     0758   

     0759                  if (!$k && $k_max == 1)

     0760                  {

     0761                      // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop

     0762                          // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases

     0763  //                        if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))

     0764  //                        {

     0765                          $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];

     0766                          $tmp_pos = $pos;

     0767  //                        }

     0768   

     0769                      continue;

     0770                  }

     0771   

     0772                  // ...there we combine

     0773                  if (isset($utf_combining_class[$utf_seq[$k]]))

     0774                  {

     0775                      $starter = $nf_seq = '';

     0776                  }

     0777                  else

     0778                  {

     0779                      $starter = $utf_seq[$k++];

     0780                      $nf_seq = '';

     0781                  }

     0782                  $utf_sort = array();

     0783   

     0784                  // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine

     0785                  // at the end of the string without altering it

     0786                  $utf_seq[] = '';

     0787   

     0788                  do

     0789                  {

     0790                      $utf_char = $utf_seq[$k++];

     0791   

     0792                      if (isset($utf_combining_class[$utf_char]))

     0793                      {

     0794                          $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;

     0795                      }

     0796                      else

     0797                      {

     0798                          if (empty($utf_sort))

     0799                          {

     0800                              // No combining characters... check for a composite of the two starters

     0801                              if (isset($utf_canonical_comp[$starter . $utf_char]))

     0802                              {

     0803                                  // Good ol' composite character

     0804                                  $starter = $utf_canonical_comp[$starter . $utf_char];

     0805                              }

     0806                              else if (isset($utf_jamo_type[$utf_char]))

     0807                              {

     0808                                  // Current char is a composable jamo

     0809                                  if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)

     0810                                  {

     0811                                      // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo

     0812                                      if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)

     0813                                      {

     0814                                          // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)

     0815                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];

     0816                                          ++$k;

     0817                                      }

     0818                                      else

     0819                                      {

     0820                                          // L+V jamos, combine to a LV Hangul syllable

     0821                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];

     0822                                      }

     0823   

     0824                                      $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));

     0825                                  }

     0826                                  else

     0827                                  {

     0828                                      // Non-composable jamo, just add it to the sequence

     0829                                      $nf_seq .= $starter;

     0830                                      $starter = $utf_char;

     0831                                  }

     0832                              }

     0833                              else

     0834                              {

     0835                                  // No composite, just add the first starter to the sequence then continue with the other one

     0836                                  $nf_seq .= $starter;

     0837                                  $starter = $utf_char;

     0838                              }

     0839                          }

     0840                          else

     0841                          {

     0842                              ksort($utf_sort);

     0843   

     0844                              // For each class of combining characters

     0845                              foreach ($utf_sort as $cc => $utf_chars)

     0846                              {

     0847                                  $j = 0;

     0848   

     0849                                  do

     0850                                  {

     0851                                      // Look for a composite

     0852                                      if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))

     0853                                      {

     0854                                          // Found a composite, replace the starter

     0855                                          $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];

     0856                                          unset($utf_sort[$cc][$j]);

     0857                                      }

     0858                                      else

     0859                                      {

     0860                                          // No composite, all following characters in that class are blocked

     0861                                          break;

     0862                                      }

     0863                                  }

     0864                                  while (isset($utf_sort[$cc][++$j]));

     0865                              }

     0866   

     0867                              // Add the starter to the normalized sequence, followed by non-starters in canonical order

     0868                              $nf_seq .= $starter;

     0869   

     0870                              foreach ($utf_sort as $utf_chars)

     0871                              {

     0872                                  if (!empty($utf_chars))

     0873                                  {

     0874                                      $nf_seq .= implode('', $utf_chars);

     0875                                  }

     0876                              }

     0877   

     0878                              // Reset the array and go on

     0879                              $utf_sort = array();

     0880                              $starter = $utf_char;

     0881                          }

     0882                      }

     0883                  }

     0884                  while ($k <= $k_max);

     0885   

     0886                  $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;

     0887                  $tmp_pos = $pos;

     0888              }

     0889              else

     0890              {

     0891                  // Only an ASCII char can make the program get here

     0892                  //

     0893                  // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().

     0894                  //

     0895                  // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on

     0896                  // multi-byte text (where the only ASCII chars are spaces and punctuation)

     0897                  if (++$pos != $len)

     0898                  {

     0899                      if ($str[$pos] < "\x80")

     0900                      {

     0901                          $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);

     0902                          $buffer[++$i & 7] = $str[$pos - 1];

     0903                      }

     0904                      else

     0905                      {

     0906                          $buffer[++$i & 7] = $c;

     0907                      }

     0908                  }

     0909              }

     0910          }

     0911          while ($pos < $len);

     0912   

     0913          // Now is time to return the string

     0914          if ($tmp_pos)

     0915          {

     0916              // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version

     0917              if ($tmp_pos == $len)

     0918              {

     0919                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str

     0920                  return $tmp;

     0921              }

     0922              else

     0923              {

     0924                  // The rightmost chunk of $str has not been appended to $tmp yet

     0925                  return $tmp . substr($str, $tmp_pos);

     0926              }

     0927          }

     0928   

     0929          // The string was already in normal form

     0930          return $str;

     0931      }

     0932   

     0933      /**

     0934      * Decompose a UTF string

     0935      *

     0936      * @param    string    $str            UTF string

     0937      * @param    integer    $pos            Position of the first UTF char (in bytes)

     0938      * @param    integer    $len            Length of the string (in bytes)

     0939      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified

     0940      * @return    string                    The string, decomposed and sorted canonically

     0941      *

     0942      * @access    private

     0943      */

     0944      static function decompose($str, $pos, $len, &$decomp_map)

     0945      {

     0946          global $utf_combining_class;

     0947   

     0948          // Load some commonly-used tables

     0949          if (!isset($utf_combining_class))

     0950          {

     0951              global $phpbb_root_path, $phpEx;

     0952              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);

     0953          }

     0954   

     0955          // UTF char length array

     0956          $utf_len_mask = array(

     0957              // Leading bytes masks

     0958              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,

     0959              // Trailing bytes masks

     0960              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0

     0961          );

     0962   

     0963          // Some extra checks are triggered on the first byte of a UTF sequence

     0964          $extra_check = array(

     0965              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,

     0966              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,

     0967              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1

     0968          );

     0969   

     0970          // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:

     0971          //   - 2-byte: 110? ???? 10?? ????

     0972          //   - 3-byte: 1110 ???? 10?? ???? 10?? ????

     0973          //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????

     0974          // Note that 5- and 6- byte sequences are automatically discarded

     0975          $utf_validation_mask = array(

     0976              2    => "\xE0\xC0",

     0977              3    => "\xF0\xC0\xC0",

     0978              4    => "\xF8\xC0\xC0\xC0"

     0979          );

     0980   

     0981          $utf_validation_check = array(

     0982              2    => "\xC0\x80",

     0983              3    => "\xE0\x80\x80",

     0984              4    => "\xF0\x80\x80\x80"

     0985          );

     0986   

     0987          $tmp = '';

     0988          $starter_pos = $pos;

     0989          $tmp_pos = $last_cc = $sort = $dump = 0;

     0990          $utf_sort = array();

     0991   

     0992          // Main loop

     0993          do

     0994          {

     0995              // STEP 0: Capture the current char

     0996   

     0997              $cur_mask = $str[$pos] & "\xF0";

     0998              if (isset($utf_len_mask[$cur_mask]))

     0999              {

     1000                  if ($utf_len = $utf_len_mask[$cur_mask])

     1001                  {

     1002                      // Multibyte char

     1003                      $utf_char = substr($str, $pos, $utf_len);

     1004                      $pos += $utf_len;

     1005                  }

     1006                  else

     1007                  {

     1008                      // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode

     1009                      // replacement char and we will advance the cursor

     1010                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);

     1011   

     1012                      if ($dump)

     1013                      {

     1014                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1015   

     1016                          // Dump combiners

     1017                          if (!empty($utf_sort))

     1018                          {

     1019                              if ($sort)

     1020                              {

     1021                                  ksort($utf_sort);

     1022                              }

     1023   

     1024                              foreach ($utf_sort as $utf_chars)

     1025                              {

     1026                                  $tmp .= implode('', $utf_chars);

     1027                              }

     1028                          }

     1029   

     1030                          $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);

     1031                          $dump = $sort = 0;

     1032                      }

     1033                      else

     1034                      {

     1035                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

     1036                      }

     1037   

     1038                      $pos += $spn;

     1039                      $tmp_pos = $starter_pos = $pos;

     1040   

     1041                      $utf_sort = array();

     1042                      $last_cc = 0;

     1043   

     1044                      continue;

     1045                  }

     1046   

     1047                  // STEP 1: Decide what to do with current char

     1048   

     1049                  // Now, in that order:

     1050                  //  - check if that character is decomposable

     1051                  //  - check if that character is a non-starter

     1052                  //  - check if that character requires extra checks to be performed

     1053                  if (isset($decomp_map[$utf_char]))

     1054                  {

     1055                      // Decompose the char

     1056                      $_pos = 0;

     1057                      $_len = strlen($decomp_map[$utf_char]);

     1058   

     1059                      do

     1060                      {

     1061                          $c = $decomp_map[$utf_char][$_pos];

     1062                          $_utf_len =& $utf_len_mask[$c & "\xF0"];

     1063   

     1064                          if (isset($_utf_len))

     1065                          {

     1066                              $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     1067                              $_pos += $_utf_len;

     1068   

     1069                              if (isset($utf_combining_class[$_utf_char]))

     1070                              {

     1071                                  // The character decomposed to a non-starter, buffer it for sorting

     1072                                  $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;

     1073   

     1074                                  if ($utf_combining_class[$_utf_char] < $last_cc)

     1075                                  {

     1076                                      // Not canonically ordered, will require sorting

     1077                                      $sort = $dump = 1;

     1078                                  }

     1079                                  else

     1080                                  {

     1081                                      $dump = 1;

     1082                                      $last_cc = $utf_combining_class[$_utf_char];

     1083                                  }

     1084                              }

     1085                              else

     1086                              {

     1087                                  // This character decomposition contains a starter, dump the buffer and continue

     1088                                  if ($dump)

     1089                                  {

     1090                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1091   

     1092                                      // Dump combiners

     1093                                      if (!empty($utf_sort))

     1094                                      {

     1095                                          if ($sort)

     1096                                          {

     1097                                              ksort($utf_sort);

     1098                                          }

     1099   

     1100                                          foreach ($utf_sort as $utf_chars)

     1101                                          {

     1102                                              $tmp .= implode('', $utf_chars);

     1103                                          }

     1104                                      }

     1105   

     1106                                      $tmp .= $_utf_char;

     1107                                      $dump = $sort = 0;

     1108                                  }

     1109                                  else

     1110                                  {

     1111                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;

     1112                                  }

     1113   

     1114                                  $tmp_pos = $starter_pos = $pos;

     1115                                  $utf_sort = array();

     1116                                  $last_cc = 0;

     1117                              }

     1118                          }

     1119                          else

     1120                          {

     1121                              // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue

     1122                              ++$_pos;

     1123   

     1124                              if ($dump)

     1125                              {

     1126                                  $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1127   

     1128                                  // Dump combiners

     1129                                  if (!empty($utf_sort))

     1130                                  {

     1131                                      if ($sort)

     1132                                      {

     1133                                          ksort($utf_sort);

     1134                                      }

     1135   

     1136                                      foreach ($utf_sort as $utf_chars)

     1137                                      {

     1138                                          $tmp .= implode('', $utf_chars);

     1139                                      }

     1140                                  }

     1141   

     1142                                  $tmp .= $c;

     1143                                  $dump = $sort = 0;

     1144                              }

     1145                              else

     1146                              {

     1147                                  $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;

     1148                              }

     1149   

     1150                              $tmp_pos = $starter_pos = $pos;

     1151                              $utf_sort = array();

     1152                              $last_cc = 0;

     1153                          }

     1154                      }

     1155                      while ($_pos < $_len);

     1156                  }

     1157                  else if (isset($utf_combining_class[$utf_char]))

     1158                  {

     1159                      // Combining character

     1160                      if ($utf_combining_class[$utf_char] < $last_cc)

     1161                      {

     1162                          // Not in canonical order

     1163                          $sort = $dump = 1;

     1164                      }

     1165                      else

     1166                      {

     1167                          $last_cc = $utf_combining_class[$utf_char];

     1168                      }

     1169   

     1170                      $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;

     1171                  }

     1172                  else

     1173                  {

     1174                      // Non-decomposable starter, check out if it's a Hangul syllable

     1175                      if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)

     1176                      {

     1177                          // Nope, regular UTF char, check that we have the correct number of trailing bytes

     1178                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])

     1179                          {

     1180                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char

     1181                              // has been encoded in a five- or six- byte sequence.

     1182                              // Move the cursor back to its original position then advance it to the position it should really be at

     1183                              $pos -= $utf_len;

     1184                              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1185   

     1186                              if (!empty($utf_sort))

     1187                              {

     1188                                  ksort($utf_sort);

     1189   

     1190                                  foreach ($utf_sort as $utf_chars)

     1191                                  {

     1192                                      $tmp .= implode('', $utf_chars);

     1193                                  }

     1194                                  $utf_sort = array();

     1195                              }

     1196   

     1197                              // Add a replacement char then another replacement char for every trailing byte.

     1198                              //

     1199                              // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this

     1200                              $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);

     1201                              $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);

     1202   

     1203                              $dump = $sort = 0;

     1204   

     1205                              $pos += $spn;

     1206                              $tmp_pos = $pos;

     1207                              continue;

     1208                          }

     1209   

     1210                          if (isset($extra_check[$utf_char[0]]))

     1211                          {

     1212                              switch ($utf_char[0])

     1213                              {

     1214                                  // Note: 0xED is quite common in Korean

     1215                                  case "\xED":

     1216                                      if ($utf_char >= "\xED\xA0\x80")

     1217                                      {

     1218                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)

     1219                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1220   

     1221                                          if (!empty($utf_sort))

     1222                                          {

     1223                                              ksort($utf_sort);

     1224   

     1225                                              foreach ($utf_sort as $utf_chars)

     1226                                              {

     1227                                                  $tmp .= implode('', $utf_chars);

     1228                                              }

     1229                                              $utf_sort = array();

     1230                                          }

     1231   

     1232                                          $tmp .= UTF8_REPLACEMENT;

     1233                                          $dump = $sort = 0;

     1234   

     1235                                          $tmp_pos = $starter_pos = $pos;

     1236                                          continue 2;

     1237                                      }

     1238                                  break;

     1239   

     1240                                  // Note: 0xEF is quite common in Japanese

     1241                                  case "\xEF":

     1242                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")

     1243                                      {

     1244                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)

     1245                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1246   

     1247                                          if (!empty($utf_sort))

     1248                                          {

     1249                                              ksort($utf_sort);

     1250   

     1251                                              foreach ($utf_sort as $utf_chars)

     1252                                              {

     1253                                                  $tmp .= implode('', $utf_chars);

     1254                                              }

     1255                                              $utf_sort = array();

     1256                                          }

     1257   

     1258                                          $tmp .= UTF8_REPLACEMENT;

     1259                                          $dump = $sort = 0;

     1260   

     1261                                          $tmp_pos = $starter_pos = $pos;

     1262                                          continue 2;

     1263                                      }

     1264                                  break;

     1265   

     1266                                  case "\xC0":

     1267                                  case "\xC1":

     1268                                      if ($utf_char <= "\xC1\xBF")

     1269                                      {

     1270                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char

     1271                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1272   

     1273                                          if (!empty($utf_sort))

     1274                                          {

     1275                                              ksort($utf_sort);

     1276   

     1277                                              foreach ($utf_sort as $utf_chars)

     1278                                              {

     1279                                                  $tmp .= implode('', $utf_chars);

     1280                                              }

     1281                                              $utf_sort = array();

     1282                                          }

     1283   

     1284                                          $tmp .= UTF8_REPLACEMENT;

     1285                                          $dump = $sort = 0;

     1286   

     1287                                          $tmp_pos = $starter_pos = $pos;

     1288                                          continue 2;

     1289                                      }

     1290                                  break;

     1291   

     1292                                  case "\xE0":

     1293                                      if ($utf_char <= "\xE0\x9F\xBF")

     1294                                      {

     1295                                          // Unicode char U+0000..U+07FF encoded in 3 bytes

     1296                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1297   

     1298                                          if (!empty($utf_sort))

     1299                                          {

     1300                                              ksort($utf_sort);

     1301   

     1302                                              foreach ($utf_sort as $utf_chars)

     1303                                              {

     1304                                                  $tmp .= implode('', $utf_chars);

     1305                                              }

     1306                                              $utf_sort = array();

     1307                                          }

     1308   

     1309                                          $tmp .= UTF8_REPLACEMENT;

     1310                                          $dump = $sort = 0;

     1311   

     1312                                          $tmp_pos = $starter_pos = $pos;

     1313                                          continue 2;

     1314                                      }

     1315                                  break;

     1316   

     1317                                  case "\xF0":

     1318                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")

     1319                                      {

     1320                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes

     1321                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1322   

     1323                                          if (!empty($utf_sort))

     1324                                          {

     1325                                              ksort($utf_sort);

     1326   

     1327                                              foreach ($utf_sort as $utf_chars)

     1328                                              {

     1329                                                  $tmp .= implode('', $utf_chars);

     1330                                              }

     1331                                              $utf_sort = array();

     1332                                          }

     1333   

     1334                                          $tmp .= UTF8_REPLACEMENT;

     1335                                          $dump = $sort = 0;

     1336   

     1337                                          $tmp_pos = $starter_pos = $pos;

     1338                                          continue 2;

     1339                                      }

     1340                                  break;

     1341   

     1342                                  default:

     1343                                      if ($utf_char > UTF8_MAX)

     1344                                      {

     1345                                          // Out of the Unicode range

     1346                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1347   

     1348                                          if (!empty($utf_sort))

     1349                                          {

     1350                                              ksort($utf_sort);

     1351   

     1352                                              foreach ($utf_sort as $utf_chars)

     1353                                              {

     1354                                                  $tmp .= implode('', $utf_chars);

     1355                                              }

     1356                                              $utf_sort = array();

     1357                                          }

     1358   

     1359                                          $tmp .= UTF8_REPLACEMENT;

     1360                                          $dump = $sort = 0;

     1361   

     1362                                          $tmp_pos = $starter_pos = $pos;

     1363                                          continue 2;

     1364                                      }

     1365                                  break;

     1366                              }

     1367                          }

     1368                      }

     1369                      else

     1370                      {

     1371                          // Hangul syllable

     1372                          $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;

     1373   

     1374                          // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).

     1375                          //

     1376                          // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte

     1377                          if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)

     1378                          {

     1379                              if ($t_index < 25)

     1380                              {

     1381                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";

     1382                                  $utf_char[8] = chr(0xA7 + $t_index);

     1383                              }

     1384                              else

     1385                              {

     1386                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";

     1387                                  $utf_char[8] = chr(0x67 + $t_index);

     1388                              }

     1389                          }

     1390                          else

     1391                          {

     1392                              $utf_char = "\xE1\x84\x00\xE1\x85\x00";

     1393                          }

     1394   

     1395                          $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));

     1396                          $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));

     1397   

     1398                          // Just like other decompositions, the resulting Jamos must be dumped to the tmp string

     1399                          $dump = 1;

     1400                      }

     1401   

     1402                      // Do we need to dump stuff to the tmp string?

     1403                      if ($dump)

     1404                      {

     1405                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1406   

     1407                          // Dump combiners

     1408                          if (!empty($utf_sort))

     1409                          {

     1410                              if ($sort)

     1411                              {

     1412                                  ksort($utf_sort);

     1413                              }

     1414   

     1415                              foreach ($utf_sort as $utf_chars)

     1416                              {

     1417                                  $tmp .= implode('', $utf_chars);

     1418                              }

     1419                          }

     1420   

     1421                          $tmp .= $utf_char;

     1422                          $dump = $sort = 0;

     1423                          $tmp_pos = $pos;

     1424                      }

     1425   

     1426                      $last_cc = 0;

     1427                      $utf_sort = array();

     1428                      $starter_pos = $pos;

     1429                  }

     1430              }

     1431              else

     1432              {

     1433                  // ASCII char, which happens to be a starter (as any other ASCII char)

     1434                  if ($dump)

     1435                  {

     1436                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1437   

     1438                      // Dump combiners

     1439                      if (!empty($utf_sort))

     1440                      {

     1441                          if ($sort)

     1442                          {

     1443                              ksort($utf_sort);

     1444                          }

     1445   

     1446                          foreach ($utf_sort as $utf_chars)

     1447                          {

     1448                              $tmp .= implode('', $utf_chars);

     1449                          }

     1450                      }

     1451   

     1452                      $tmp .= $str[$pos];

     1453                      $dump = $sort = 0;

     1454                      $tmp_pos = ++$pos;

     1455   

     1456                      $pos += strspn($str, UTF8_ASCII_RANGE, $pos);

     1457                  }

     1458                  else

     1459                  {

     1460                      $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);

     1461                  }

     1462   

     1463                  $last_cc = 0;

     1464                  $utf_sort = array();

     1465                  $starter_pos = $pos;

     1466              }

     1467          }

     1468          while ($pos < $len);

     1469   

     1470          // Now is time to return the string

     1471          if ($dump)

     1472          {

     1473              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1474   

     1475              // Dump combiners

     1476              if (!empty($utf_sort))

     1477              {

     1478                  if ($sort)

     1479                  {

     1480                      ksort($utf_sort);

     1481                  }

     1482   

     1483                  foreach ($utf_sort as $utf_chars)

     1484                  {

     1485                      $tmp .= implode('', $utf_chars);

     1486                  }

     1487              }

     1488   

     1489              return $tmp;

     1490          }

     1491          else if ($tmp_pos)

     1492          {

     1493              // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version

     1494              if ($tmp_pos == $len)

     1495              {

     1496                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str

     1497                  return $tmp;

     1498              }

     1499              else

     1500              {

     1501                  // The rightmost chunk of $str has not been appended to $tmp yet

     1502                  return $tmp . substr($str, $tmp_pos);

     1503              }

     1504          }

     1505   

     1506          // The string was already in normal form

     1507          return $str;

     1508      }

     1509  }

     1510

Verzeichnisstruktur phpBB-3.1.0

Zuletzt modifiziert: 09.10.2024, 12:52 - Dateigröße: 41.87 KiB

utf_normalizer.php


     0001  <?php

     0002  /**

     0003  *

     0004  * This file is part of the phpBB Forum Software package.

     0005  *

     0006  * @copyright (c) phpBB Limited <https://www.phpbb.com>

     0007  * @license GNU General Public License, version 2 (GPL-2.0)

     0008  *

     0009  * For full copyright and license information, please see

     0010  * the docs/CREDITS.txt file.

     0011  *

     0012  */

     0013   

     0014  /**

     0015  */

     0016  if (!defined('IN_PHPBB'))

     0017  {

     0018      exit;

     0019  }

     0020   

     0021  /**

     0022  * Some Unicode characters encoded in UTF-8

     0023  *

     0024  * Preserved for compatibility

     0025  */

     0026  define('UTF8_REPLACEMENT', "\xEF\xBF\xBD");

     0027  define('UTF8_MAX', "\xF4\x8F\xBF\xBF");

     0028  define('UTF8_FFFE', "\xEF\xBF\xBE");

     0029  define('UTF8_FFFF', "\xEF\xBF\xBF");

     0030  define('UTF8_SURROGATE_FIRST', "\xED\xA0\x80");

     0031  define('UTF8_SURROGATE_LAST', "\xED\xBF\xBF");

     0032  define('UTF8_HANGUL_FIRST', "\xEA\xB0\x80");

     0033  define('UTF8_HANGUL_LAST', "\xED\x9E\xA3");

     0034   

     0035  define('UTF8_CJK_FIRST', "\xE4\xB8\x80");

     0036  define('UTF8_CJK_LAST', "\xE9\xBE\xBB");

     0037  define('UTF8_CJK_B_FIRST', "\xF0\xA0\x80\x80");

     0038  define('UTF8_CJK_B_LAST', "\xF0\xAA\x9B\x96");

     0039   

     0040  // Unset global variables

     0041  unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

     0042   

     0043  // NFC_QC and NFKC_QC values

     0044  define('UNICODE_QC_MAYBE', 0);

     0045  define('UNICODE_QC_NO', 1);

     0046   

     0047  // Contains all the ASCII characters appearing in UTF-8, sorted by frequency

     0048  define('UTF8_ASCII_RANGE', "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F");

     0049   

     0050  // Contains all the tail bytes that can appear in the composition of a UTF-8 char

     0051  define('UTF8_TRAILING_BYTES', "\xA9\xA0\xA8\x80\xAA\x99\xA7\xBB\xAB\x89\x94\x82\xB4\xA2\xAE\x83\xB0\xB9\xB8\x93\xAF\xBC\xB3\x81\xA4\xB2\x9C\xA1\xB5\xBE\xBD\xBA\x98\xAD\xB1\x84\x95\xA6\xB6\x88\x8D\x90\xB7\xBF\x92\x85\xA5\x97\x8C\x86\xA3\x8E\x9F\x8F\x87\x91\x9D\xAC\x9E\x8B\x96\x9B\x8A\x9A");

     0052   

     0053  // Constants used by the Hangul [de]composition algorithms

     0054  define('UNICODE_HANGUL_SBASE', 0xAC00);

     0055  define('UNICODE_HANGUL_LBASE', 0x1100);

     0056  define('UNICODE_HANGUL_VBASE', 0x1161);

     0057  define('UNICODE_HANGUL_TBASE', 0x11A7);

     0058  define('UNICODE_HANGUL_SCOUNT', 11172);

     0059  define('UNICODE_HANGUL_LCOUNT', 19);

     0060  define('UNICODE_HANGUL_VCOUNT', 21);

     0061  define('UNICODE_HANGUL_TCOUNT', 28);

     0062  define('UNICODE_HANGUL_NCOUNT', 588);

     0063  define('UNICODE_JAMO_L', 0);

     0064  define('UNICODE_JAMO_V', 1);

     0065  define('UNICODE_JAMO_T', 2);

     0066   

     0067  /**

     0068  * Unicode normalization routines

     0069  */

     0070  class utf_normalizer

     0071  {

    /**
    * Validate, clean up and normalize a string in place
    *
    * A string made up solely of printable ASCII, DEL, tab, line feed and
    * carriage return is left untouched. Any other string gets its remaining
    * control bytes neutralised and is then overwritten with the result of
    * utf_normalizer::recompose(), called with the NFC quick-check table and
    * the canonical decomposition table.
    *
    * @param    string    &$str    The dirty string; it is modified in place
    * @return    void
    */
    static function cleanup(&$str)
    {
        // Length of the leading run of authorized characters (the list is sorted by frequency in latin text)
        $clean_len = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
        $total_len = strlen($str);

        // When that run covers the whole string there is nothing to do
        if ($clean_len != $total_len)
        {
            // Load the NFC quick-check table on first use
            if (!isset($GLOBALS['utf_nfc_qc']))
            {
                global $phpbb_root_path, $phpEx;
                include $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx;
            }

            // Load the canonical decomposition table on first use
            if (!isset($GLOBALS['utf_canonical_decomp']))
            {
                global $phpbb_root_path, $phpEx;
                include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
            }

            // Every byte in 0x00..0x1F except \t, \n and \r is swapped for 0xFF, which is illegal
            // in UTF-8; recompose() is presumably what turns it into a replacement character.
            // The swap is byte-for-byte, so both lengths computed above remain valid.
            $str = utf_normalizer::recompose(
                strtr(
                    $str,
                    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
                    "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
                ),
                $clean_len,
                $total_len,
                $GLOBALS['utf_nfc_qc'],
                $GLOBALS['utf_canonical_decomp']
            );
        }
    }

     0116   

    /**
    * Validate a UTF string and normalize it to NFC, in place
    *
    * The string is only touched when it contains at least one byte that is
    * not part of UTF8_ASCII_RANGE; it is then overwritten with the result of
    * utf_normalizer::recompose(), called with the NFC quick-check table and
    * the canonical decomposition table.
    *
    * @param    string    &$str    Unchecked UTF string; it is modified in place
    * @return    void
    */
    static function nfc(&$str)
    {
        $ascii_len = strspn($str, UTF8_ASCII_RANGE);
        $total_len = strlen($str);

        // ASCII strings are left as they are
        if ($ascii_len != $total_len)
        {
            // Load the NFC quick-check table on first use
            if (!isset($GLOBALS['utf_nfc_qc']))
            {
                global $phpbb_root_path, $phpEx;
                include $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx;
            }

            // Load the canonical decomposition table on first use
            if (!isset($GLOBALS['utf_canonical_decomp']))
            {
                global $phpbb_root_path, $phpEx;
                include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
            }

            $str = utf_normalizer::recompose(
                $str,
                $ascii_len,
                $total_len,
                $GLOBALS['utf_nfc_qc'],
                $GLOBALS['utf_canonical_decomp']
            );
        }
    }

     0148   

    /**
    * Validate a UTF string and normalize it to NFKC, in place
    *
    * The string is only touched when it contains at least one byte that is
    * not part of UTF8_ASCII_RANGE; it is then overwritten with the result of
    * utf_normalizer::recompose(), called with the NFKC quick-check table and
    * the compatibility decomposition table.
    *
    * @param    string    &$str    Unchecked UTF string; it is modified in place
    * @return    void
    */
    static function nfkc(&$str)
    {
        $ascii_len = strspn($str, UTF8_ASCII_RANGE);
        $total_len = strlen($str);

        // ASCII strings are left as they are
        if ($ascii_len != $total_len)
        {
            // Load the NFKC quick-check table on first use
            if (!isset($GLOBALS['utf_nfkc_qc']))
            {
                global $phpbb_root_path, $phpEx;
                include $phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx;
            }

            // Load the compatibility decomposition table on first use
            if (!isset($GLOBALS['utf_compatibility_decomp']))
            {
                global $phpbb_root_path, $phpEx;
                include $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx;
            }

            $str = utf_normalizer::recompose(
                $str,
                $ascii_len,
                $total_len,
                $GLOBALS['utf_nfkc_qc'],
                $GLOBALS['utf_compatibility_decomp']
            );
        }
    }

     0180   

    /**
    * Validate a UTF string and normalize it to NFD, in place
    *
    * The string is only touched when it contains at least one byte that is
    * not part of UTF8_ASCII_RANGE; it is then overwritten with the result of
    * utf_normalizer::decompose(), called with the canonical decomposition table.
    *
    * @param    string    &$str    Unchecked UTF string; it is modified in place
    * @return    void
    */
    static function nfd(&$str)
    {
        $ascii_len = strspn($str, UTF8_ASCII_RANGE);
        $total_len = strlen($str);

        // ASCII strings are left as they are
        if ($ascii_len != $total_len)
        {
            // Load the canonical decomposition table on first use
            if (!isset($GLOBALS['utf_canonical_decomp']))
            {
                global $phpbb_root_path, $phpEx;
                include $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx;
            }

            $str = utf_normalizer::decompose(
                $str,
                $ascii_len,
                $total_len,
                $GLOBALS['utf_canonical_decomp']
            );
        }
    }

     0206   

    /**
    * Validate a UTF string and normalize it to NFKD, in place
    *
    * The string is only touched when it contains at least one byte that is
    * not part of UTF8_ASCII_RANGE; it is then overwritten with the result of
    * utf_normalizer::decompose(), called with the compatibility decomposition table.
    *
    * @param    string    &$str    Unchecked UTF string; it is modified in place
    * @return    void
    */
    static function nfkd(&$str)
    {
        $ascii_len = strspn($str, UTF8_ASCII_RANGE);
        $total_len = strlen($str);

        // ASCII strings are left as they are
        if ($ascii_len != $total_len)
        {
            // Load the compatibility decomposition table on first use
            if (!isset($GLOBALS['utf_compatibility_decomp']))
            {
                global $phpbb_root_path, $phpEx;
                include $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx;
            }

            $str = utf_normalizer::decompose(
                $str,
                $ascii_len,
                $total_len,
                $GLOBALS['utf_compatibility_decomp']
            );
        }
    }

     0232   

     0233   

     0234      /**

     0235      * Recompose a UTF string

     0236      *

     0237      * @param    string    $str            Unchecked UTF string

     0238      * @param    integer    $pos            Position of the first UTF char (in bytes)

     0239      * @param    integer    $len            Length of the string (in bytes)

     0240      * @param    array    &$qc            Quick-check array, passed by reference but never modified

     0241      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified

     0242      * @return    string                    The string, validated and recomposed

     0243      *

     0244      * @access    private

     0245      */

     0246      static function recompose($str, $pos, $len, &$qc, &$decomp_map)

     0247      {

     0248          global $utf_combining_class, $utf_canonical_comp, $utf_jamo_type, $utf_jamo_index;

     0249   

     0250          // Load some commonly-used tables

     0251          if (!isset($utf_jamo_index, $utf_jamo_type, $utf_combining_class))

     0252          {

     0253              global $phpbb_root_path, $phpEx;

     0254              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);

     0255          }

     0256   

     0257          // Load the canonical composition table

     0258          if (!isset($utf_canonical_comp))

     0259          {

     0260              global $phpbb_root_path, $phpEx;

     0261              include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);

     0262          }

     0263   

     0264          // Buffer the last ASCII char before the UTF-8 stuff if applicable

     0265          $tmp = '';

     0266          $i = $tmp_pos = $last_cc = 0;

     0267   

     0268          $buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();

     0269   

     0270          // UTF char length array

     0271          // This array is used to determine the length of a UTF character.

     0272          // Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos

     0273          // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.

     0274          // Otherwise, if $utf_len_mask[$c] is greater than 0, we have the leading byte of a multibyte character

     0275          // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.

     0276          $utf_len_mask = array(

     0277              // Leading bytes masks

     0278              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,

     0279              // Trailing bytes masks

     0280              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0

     0281          );

     0282   

     0283          $extra_check = array(

     0284              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,

     0285              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,

     0286              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1

     0287          );

     0288   

     0289          $utf_validation_mask = array(

     0290              2    => "\xE0\xC0",

     0291              3    => "\xF0\xC0\xC0",

     0292              4    => "\xF8\xC0\xC0\xC0"

     0293          );

     0294   

     0295          $utf_validation_check = array(

     0296              2    => "\xC0\x80",

     0297              3    => "\xE0\x80\x80",

     0298              4    => "\xF0\x80\x80\x80"

     0299          );

     0300   

     0301          // Main loop

     0302          do

     0303          {

     0304              // STEP 0: Capture the current char and buffer it

     0305              $c = $str[$pos];

     0306              $c_mask = $c & "\xF0";

     0307   

     0308              if (isset($utf_len_mask[$c_mask]))

     0309              {

     0310                  // Byte at $pos is either a leading byte or a misplaced trailing byte

     0311                  if ($utf_len = $utf_len_mask[$c_mask])

     0312                  {

     0313                      // Capture the char

     0314                      $buffer[++$i & 7] = $utf_char = substr($str, $pos, $utf_len);

     0315   

     0316                      // Let's find out if a thorough check is needed

     0317                      if (isset($qc[$utf_char]))

     0318                      {

     0319                          // If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block

     0320                      }

     0321                      else if (isset($utf_combining_class[$utf_char]))

     0322                      {

     0323                          if ($utf_combining_class[$utf_char] < $last_cc)

     0324                          {

     0325                              // A combining character that is NOT canonically ordered

     0326                          }

     0327                          else

     0328                          {

     0329                              // A combining character that IS canonically ordered, skip to the next char

     0330                              $last_cc = $utf_combining_class[$utf_char];

     0331   

     0332                              $pos += $utf_len;

     0333                              continue;

     0334                          }

     0335                      }

     0336                      else

     0337                      {

     0338                          // At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.

     0339                          // It can be a singleton, a canonical composite, a replacement char or even an ill-formed bunch of bytes. Let's find out

     0340                          $last_cc = 0;

     0341   

     0342                          // Check that we have the correct number of trailing bytes

     0343                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])

     0344                          {

     0345                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char

     0346                              // has been encoded in a five- or six- byte sequence

     0347                              if ($utf_char[0] >= "\xF8")

     0348                              {

     0349                                  if ($utf_char[0] < "\xFC")

     0350                                  {

     0351                                      $trailing_bytes = 4;

     0352                                  }

     0353                                  else if ($utf_char[0] > "\xFD")

     0354                                  {

     0355                                      $trailing_bytes = 0;

     0356                                  }

     0357                                  else

     0358                                  {

     0359                                      $trailing_bytes = 5;

     0360                                  }

     0361                              }

     0362                              else

     0363                              {

     0364                                  $trailing_bytes = $utf_len - 1;

     0365                              }

     0366   

     0367                              $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0368                              $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);

     0369                              $tmp_pos = $pos;

     0370   

     0371                              continue;

     0372                          }

     0373   

     0374                          if (isset($extra_check[$c]))

     0375                          {

     0376                              switch ($c)

     0377                              {

     0378                                  // Note: 0xED is quite common in Korean

     0379                                  case "\xED":

     0380                                      if ($utf_char >= "\xED\xA0\x80")

     0381                                      {

     0382                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)

     0383                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0384                                          $pos += $utf_len;

     0385                                          $tmp_pos = $pos;

     0386                                          continue 2;

     0387                                      }

     0388                                  break;

     0389   

     0390                                  // Note: 0xEF is quite common in Japanese

     0391                                  case "\xEF":

     0392                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")

     0393                                      {

     0394                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)

     0395                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0396                                          $pos += $utf_len;

     0397                                          $tmp_pos = $pos;

     0398                                          continue 2;

     0399                                      }

     0400                                  break;

     0401   

     0402                                  case "\xC0":

     0403                                  case "\xC1":

     0404                                      if ($utf_char <= "\xC1\xBF")

     0405                                      {

     0406                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char

     0407                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0408                                          $pos += $utf_len;

     0409                                          $tmp_pos = $pos;

     0410                                          continue 2;

     0411                                      }

     0412                                  break;

     0413   

     0414                                  case "\xE0":

     0415                                      if ($utf_char <= "\xE0\x9F\xBF")

     0416                                      {

     0417                                          // Unicode char U+0000..U+07FF encoded in 3 bytes

     0418                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0419                                          $pos += $utf_len;

     0420                                          $tmp_pos = $pos;

     0421                                          continue 2;

     0422                                      }

     0423                                  break;

     0424   

     0425                                  case "\xF0":

     0426                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")

     0427                                      {

     0428                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes

     0429                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0430                                          $pos += $utf_len;

     0431                                          $tmp_pos = $pos;

     0432                                          continue 2;

     0433                                      }

     0434                                  break;

     0435   

     0436                                  default:

     0437                                      // Five- and six- byte sequences do not need being checked for here anymore

     0438                                      if ($utf_char > UTF8_MAX)

     0439                                      {

     0440                                          // Out of the Unicode range

     0441                                          if ($utf_char[0] < "\xF8")

     0442                                          {

     0443                                              $trailing_bytes = 3;

     0444                                          }

     0445                                          else if ($utf_char[0] < "\xFC")

     0446                                          {

     0447                                              $trailing_bytes = 4;

     0448                                          }

     0449                                          else if ($utf_char[0] > "\xFD")

     0450                                          {

     0451                                              $trailing_bytes = 0;

     0452                                          }

     0453                                          else

     0454                                          {

     0455                                              $trailing_bytes = 5;

     0456                                          }

     0457   

     0458                                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . UTF8_REPLACEMENT;

     0459                                          $pos += strspn($str, UTF8_TRAILING_BYTES, ++$pos, $trailing_bytes);

     0460                                          $tmp_pos = $pos;

     0461                                          continue 2;

     0462                                      }

     0463                                  break;

     0464                              }

     0465                          }

     0466   

     0467                          // The char is a valid starter, move the cursor and go on

     0468                          $pos += $utf_len;

     0469                          continue;

     0470                      }

     0471                  }

     0472                  else

     0473                  {

     0474                      // A trailing byte came out of nowhere, we will advance the cursor and treat this byte and all following trailing bytes as if

     0475                      // each of them was a Unicode replacement char

     0476                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);

     0477                      $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

     0478   

     0479                      $pos += $spn;

     0480                      $tmp_pos = $pos;

     0481                      continue;

     0482                  }

     0483   

     0484                  // STEP 1: Decompose current char

     0485   

     0486                  // We have found a character that is either:

     0487                  //  - in the NFC_QC/NFKC_QC list

     0488                  //  - a non-starter char that is not canonically ordered

     0489                  //

     0490                  // We are going to capture the shortest UTF sequence that satisfies these two conditions:

     0491                  //

     0492                  //  1 - If the sequence does not start at the beginning of the string, it must begin with a starter,

     0493                  // and that starter must not have the NF[K]C_QC property equal to "MAYBE"

     0494                  //

     0495                  //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be

     0496                  // immediately followed by a starter that is not on the QC list

     0497                  //

     0498                  $utf_seq = array();

     0499                  $last_cc = 0;

     0500                  $lpos = $pos;

     0501                  $pos += $utf_len;

     0502   

     0503                  if (isset($decomp_map[$utf_char]))

     0504                  {

     0505                      $_pos = 0;

     0506                      $_len = strlen($decomp_map[$utf_char]);

     0507   

     0508                      do

     0509                      {

     0510                          $_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "\xF0"];

     0511   

     0512                          if (isset($_utf_len))

     0513                          {

     0514                              $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0515                              $_pos += $_utf_len;

     0516                          }

     0517                          else

     0518                          {

     0519                              $utf_seq[] = $decomp_map[$utf_char][$_pos];

     0520                              ++$_pos;

     0521                          }

     0522                      }

     0523                      while ($_pos < $_len);

     0524                  }

     0525                  else

     0526                  {

     0527                      // The char is not decomposable

     0528                      $utf_seq = array($utf_char);

     0529                  }

     0530   

     0531                  // STEP 2: Capture the starter

     0532   

     0533                  // Check out the combining class of the first character of the UTF sequence

     0534                  $k = 0;

     0535                  if (isset($utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)

     0536                  {

     0537                      // Not a starter, inspect previous characters

     0538                      // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.

     0539                      // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,

     0540                      // although it is slower than this method.

     0541                      //

     0542                      // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is

     0543                      // at offset $i) and process them in backward mode until we find a starter.

     0544                      //

     0545                      // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more

     0546                      // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering

     0547                      $starter_found = 0;

     0548                      $j_min = max(1, $i - 7);

     0549   

     0550                      for ($j = $i - 1; $j >= $j_min && $lpos > $tmp_pos; --$j)

     0551                      {

     0552                          $utf_char = $buffer[$j & 7];

     0553                          $lpos -= strlen($utf_char);

     0554   

     0555                          if (isset($decomp_map[$utf_char]))

     0556                          {

     0557                              // The char is a composite, decompose for storage

     0558                              $decomp_seq = array();

     0559                              $_pos = 0;

     0560                              $_len = strlen($decomp_map[$utf_char]);

     0561   

     0562                              do

     0563                              {

     0564                                  $c = $decomp_map[$utf_char][$_pos];

     0565                                  $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0566   

     0567                                  if (isset($_utf_len))

     0568                                  {

     0569                                      $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0570                                      $_pos += $_utf_len;

     0571                                  }

     0572                                  else

     0573                                  {

     0574                                      $decomp_seq[] = $c;

     0575                                      ++$_pos;

     0576                                  }

     0577                              }

     0578                              while ($_pos < $_len);

     0579   

     0580                              // Prepend the UTF sequence with our decomposed sequence

     0581                              if (isset($decomp_seq[1]))

     0582                              {

     0583                                  // The char expanded into several chars

     0584                                  $decomp_cnt = sizeof($decomp_seq);

     0585   

     0586                                  foreach ($decomp_seq as $decomp_i => $decomp_char)

     0587                                  {

     0588                                      $utf_seq[$k + $decomp_i - $decomp_cnt] = $decomp_char;

     0589                                  }

     0590                                  $k -= $decomp_cnt;

     0591                              }

     0592                              else

     0593                              {

     0594                                  // Decomposed to a single char, easier to prepend

     0595                                  $utf_seq[--$k] = $decomp_seq[0];

     0596                              }

     0597                          }

     0598                          else

     0599                          {

     0600                              $utf_seq[--$k] = $utf_char;

     0601                          }

     0602   

     0603                          if (!isset($utf_combining_class[$utf_seq[$k]]))

     0604                          {

     0605                              // We have found our starter

     0606                              $starter_found = 1;

     0607                              break;

     0608                          }

     0609                      }

     0610   

     0611                      if (!$starter_found && $lpos > $tmp_pos)

     0612                      {

     0613                          // The starter was not found in the buffer, let's rewind some more

     0614                          do

     0615                          {

     0616                              // $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_len > 0 then it's a leading byte, otherwise it's a trailing byte.

     0617                              $c = $str[--$lpos];

     0618                              $c_mask = $c & "\xF0";

     0619   

     0620                              if (isset($utf_len_mask[$c_mask]))

     0621                              {

     0622                                  // UTF byte

     0623                                  if ($utf_len = $utf_len_mask[$c_mask])

     0624                                  {

     0625                                      // UTF *leading* byte

     0626                                      $utf_char = substr($str, $lpos, $utf_len);

     0627   

     0628                                      if (isset($decomp_map[$utf_char]))

     0629                                      {

     0630                                          // Decompose the character

     0631                                          $decomp_seq = array();

     0632                                          $_pos = 0;

     0633                                          $_len = strlen($decomp_map[$utf_char]);

     0634   

     0635                                          do

     0636                                          {

     0637                                              $c = $decomp_map[$utf_char][$_pos];

     0638                                              $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0639   

     0640                                              if (isset($_utf_len))

     0641                                              {

     0642                                                  $decomp_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0643                                                  $_pos += $_utf_len;

     0644                                              }

     0645                                              else

     0646                                              {

     0647                                                  $decomp_seq[] = $c;

     0648                                                  ++$_pos;

     0649                                              }

     0650                                          }

     0651                                          while ($_pos < $_len);

     0652   

     0653                                          // Prepend the UTF sequence with our decomposed sequence

     0654                                          if (isset($decomp_seq[1]))

     0655                                          {

     0656                                              // The char expanded into several chars

     0657                                              $decomp_cnt = sizeof($decomp_seq);

     0658                                              foreach ($decomp_seq as $decomp_i => $utf_char)

     0659                                              {

     0660                                                  $utf_seq[$k + $decomp_i - $decomp_cnt] = $utf_char;

     0661                                              }

     0662                                              $k -= $decomp_cnt;

     0663                                          }

     0664                                          else

     0665                                          {

     0666                                              // Decomposed to a single char, easier to prepend

     0667                                              $utf_seq[--$k] = $decomp_seq[0];

     0668                                          }

     0669                                      }

     0670                                      else

     0671                                      {

     0672                                          $utf_seq[--$k] = $utf_char;

     0673                                      }

     0674                                  }

     0675                              }

     0676                              else

     0677                              {

     0678                                  // ASCII char

     0679                                  $utf_seq[--$k] = $c;

     0680                              }

     0681                          }

     0682                          while ($lpos > $tmp_pos);

     0683                      }

     0684                  }

     0685   

     0686                  // STEP 3: Capture following combining modifiers

     0687   

     0688                  while ($pos < $len)

     0689                  {

     0690                      $c_mask = $str[$pos] & "\xF0";

     0691   

     0692                      if (isset($utf_len_mask[$c_mask]))

     0693                      {

     0694                          if ($utf_len = $utf_len_mask[$c_mask])

     0695                          {

     0696                              $utf_char = substr($str, $pos, $utf_len);

     0697                          }

     0698                          else

     0699                          {

     0700                              // A trailing byte came out of nowhere

     0701                              // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop

     0702                              // as if it was a starter (replacement chars ARE starters) and let the next loop replace it

     0703                              break;

     0704                          }

     0705   

     0706                          if (isset($utf_combining_class[$utf_char]) || isset($qc[$utf_char]))

     0707                          {

     0708                              // Combining character, add it to the sequence and move the cursor

     0709                              if (isset($decomp_map[$utf_char]))

     0710                              {

     0711                                  // Decompose the character

     0712                                  $_pos = 0;

     0713                                  $_len = strlen($decomp_map[$utf_char]);

     0714   

     0715                                  do

     0716                                  {

     0717                                      $c = $decomp_map[$utf_char][$_pos];

     0718                                      $_utf_len =& $utf_len_mask[$c & "\xF0"];

     0719   

     0720                                      if (isset($_utf_len))

     0721                                      {

     0722                                          $utf_seq[] = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     0723                                          $_pos += $_utf_len;

     0724                                      }

     0725                                      else

     0726                                      {

     0727                                          $utf_seq[] = $c;

     0728                                          ++$_pos;

     0729                                      }

     0730                                  }

     0731                                  while ($_pos < $_len);

     0732                              }

     0733                              else

     0734                              {

     0735                                  $utf_seq[] = $utf_char;

     0736                              }

     0737   

     0738                              $pos += $utf_len;

     0739                          }

     0740                          else

     0741                          {

     0742                              // Combining class 0 and no QC, break out of the loop

     0743                              // Note: we do not know if that character is valid. If it's not, the next iteration will replace it

     0744                              break;

     0745                          }

     0746                      }

     0747                      else

     0748                      {

     0749                          // ASCII chars are starters

     0750                          break;

     0751                      }

     0752                  }

     0753   

     0754                  // STEP 4: Sort and combine

     0755   

     0756                  // Here we sort...

     0757                  $k_max = $k + sizeof($utf_seq);

     0758   

     0759                  if (!$k && $k_max == 1)

     0760                  {

     0761                      // There is only one char in the UTF sequence, add it then jump to the next iteration of main loop

     0762                          // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases

     0763  //                        if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))

     0764  //                        {

     0765                          $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $utf_seq[0];

     0766                          $tmp_pos = $pos;

     0767  //                        }

     0768   

     0769                      continue;

     0770                  }

     0771   

     0772                  // ...there we combine

     0773                  if (isset($utf_combining_class[$utf_seq[$k]]))

     0774                  {

     0775                      $starter = $nf_seq = '';

     0776                  }

     0777                  else

     0778                  {

     0779                      $starter = $utf_seq[$k++];

     0780                      $nf_seq = '';

     0781                  }

     0782                  $utf_sort = array();

     0783   

     0784                  // We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine

     0785                  // at the end of the string without altering it

     0786                  $utf_seq[] = '';

     0787   

     0788                  do

     0789                  {

     0790                      $utf_char = $utf_seq[$k++];

     0791   

     0792                      if (isset($utf_combining_class[$utf_char]))

     0793                      {

     0794                          $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;

     0795                      }

     0796                      else

     0797                      {

     0798                          if (empty($utf_sort))

     0799                          {

     0800                              // No combining characters... check for a composite of the two starters

     0801                              if (isset($utf_canonical_comp[$starter . $utf_char]))

     0802                              {

     0803                                  // Good ol' composite character

     0804                                  $starter = $utf_canonical_comp[$starter . $utf_char];

     0805                              }

     0806                              else if (isset($utf_jamo_type[$utf_char]))

     0807                              {

     0808                                  // Current char is a composable jamo

     0809                                  if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)

     0810                                  {

     0811                                      // We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo

     0812                                      if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)

     0813                                      {

     0814                                          // L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)

     0815                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];

     0816                                          ++$k;

     0817                                      }

     0818                                      else

     0819                                      {

     0820                                          // L+V jamos, combine to a LV Hangul syllable

     0821                                          $cp = $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];

     0822                                      }

     0823   

     0824                                      $starter = chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));

     0825                                  }

     0826                                  else

     0827                                  {

     0828                                      // Non-composable jamo, just add it to the sequence

     0829                                      $nf_seq .= $starter;

     0830                                      $starter = $utf_char;

     0831                                  }

     0832                              }

     0833                              else

     0834                              {

     0835                                  // No composite, just add the first starter to the sequence then continue with the other one

     0836                                  $nf_seq .= $starter;

     0837                                  $starter = $utf_char;

     0838                              }

     0839                          }

     0840                          else

     0841                          {

     0842                              ksort($utf_sort);

     0843   

     0844                              // For each class of combining characters

     0845                              foreach ($utf_sort as $cc => $utf_chars)

     0846                              {

     0847                                  $j = 0;

     0848   

     0849                                  do

     0850                                  {

     0851                                      // Look for a composite

     0852                                      if (isset($utf_canonical_comp[$starter . $utf_chars[$j]]))

     0853                                      {

     0854                                          // Found a composite, replace the starter

     0855                                          $starter = $utf_canonical_comp[$starter . $utf_chars[$j]];

     0856                                          unset($utf_sort[$cc][$j]);

     0857                                      }

     0858                                      else

     0859                                      {

     0860                                          // No composite, all following characters in that class are blocked

     0861                                          break;

     0862                                      }

     0863                                  }

     0864                                  while (isset($utf_sort[$cc][++$j]));

     0865                              }

     0866   

     0867                              // Add the starter to the normalized sequence, followed by non-starters in canonical order

     0868                              $nf_seq .= $starter;

     0869   

     0870                              foreach ($utf_sort as $utf_chars)

     0871                              {

     0872                                  if (!empty($utf_chars))

     0873                                  {

     0874                                      $nf_seq .= implode('', $utf_chars);

     0875                                  }

     0876                              }

     0877   

     0878                              // Reset the array and go on

     0879                              $utf_sort = array();

     0880                              $starter = $utf_char;

     0881                          }

     0882                      }

     0883                  }

     0884                  while ($k <= $k_max);

     0885   

     0886                  $tmp .= substr($str, $tmp_pos, $lpos - $tmp_pos) . $nf_seq;

     0887                  $tmp_pos = $pos;

     0888              }

     0889              else

     0890              {

     0891                  // Only an ASCII char can make the program get here

     0892                  //

     0893                  // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().

     0894                  //

     0895                  // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on

     0896                  // multi-byte text (where the only ASCII chars are spaces and punctuation)

     0897                  if (++$pos != $len)

     0898                  {

     0899                      if ($str[$pos] < "\x80")

     0900                      {

     0901                          $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);

     0902                          $buffer[++$i & 7] = $str[$pos - 1];

     0903                      }

     0904                      else

     0905                      {

     0906                          $buffer[++$i & 7] = $c;

     0907                      }

     0908                  }

     0909              }

     0910          }

     0911          while ($pos < $len);

     0912   

     0913          // Now is time to return the string

     0914          if ($tmp_pos)

     0915          {

     0916              // If the $tmp_pos cursor is not at the beginning of the string then at least one character was not in normal form. Replace $str with the fixed version

     0917              if ($tmp_pos == $len)

     0918              {

     0919                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str

     0920                  return $tmp;

     0921              }

     0922              else

     0923              {

     0924                  // The rightmost chunk of $str has not been appended to $tmp yet

     0925                  return $tmp . substr($str, $tmp_pos);

     0926              }

     0927          }

     0928   

     0929          // The string was already in normal form

     0930          return $str;

     0931      }

     0932   

     0933      /**

     0934      * Decompose a UTF string

     0935      *

     0936      * @param    string    $str            UTF string

     0937      * @param    integer    $pos            Position of the first UTF char (in bytes)

     0938      * @param    integer    $len            Length of the string (in bytes)

     0939      * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified

     0940      * @return    string                    The string, decomposed and sorted canonically

     0941      *

     0942      * @access    private

     0943      */

     0944      static function decompose($str, $pos, $len, &$decomp_map)

     0945      {

     0946          global $utf_combining_class;

     0947   

     0948          // Load some commonly-used tables

     0949          if (!isset($utf_combining_class))

     0950          {

     0951              global $phpbb_root_path, $phpEx;

     0952              include($phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx);

     0953          }

     0954   

     0955          // UTF char length array

     0956          $utf_len_mask = array(

     0957              // Leading bytes masks

     0958              "\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4,

     0959              // Trailing bytes masks

     0960              "\x80" => 0, "\x90" => 0, "\xA0" => 0, "\xB0" => 0

     0961          );

     0962   

     0963          // Some extra checks are triggered on the first byte of a UTF sequence

     0964          $extra_check = array(

     0965              "\xED" => 1, "\xEF" => 1, "\xC0" => 1, "\xC1" => 1, "\xE0" => 1, "\xF0" => 1,

     0966              "\xF4" => 1, "\xF5" => 1, "\xF6" => 1, "\xF7" => 1, "\xF8" => 1, "\xF9" => 1,

     0967              "\xFA" => 1, "\xFB" => 1, "\xFC" => 1, "\xFD" => 1, "\xFE" => 1, "\xFF" => 1

     0968          );

     0969   

     0970          // These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:

     0971          //   - 2-byte: 110? ???? 10?? ????

     0972          //   - 3-byte: 1110 ???? 10?? ???? 10?? ????

     0973          //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????

     0974          // Note that 5- and 6- byte sequences are automatically discarded

     0975          $utf_validation_mask = array(

     0976              2    => "\xE0\xC0",

     0977              3    => "\xF0\xC0\xC0",

     0978              4    => "\xF8\xC0\xC0\xC0"

     0979          );

     0980   

     0981          $utf_validation_check = array(

     0982              2    => "\xC0\x80",

     0983              3    => "\xE0\x80\x80",

     0984              4    => "\xF0\x80\x80\x80"

     0985          );

     0986   

     0987          $tmp = '';

     0988          $starter_pos = $pos;

     0989          $tmp_pos = $last_cc = $sort = $dump = 0;

     0990          $utf_sort = array();

     0991   

     0992          // Main loop

     0993          do

     0994          {

     0995              // STEP 0: Capture the current char

     0996   

     0997              $cur_mask = $str[$pos] & "\xF0";

     0998              if (isset($utf_len_mask[$cur_mask]))

     0999              {

     1000                  if ($utf_len = $utf_len_mask[$cur_mask])

     1001                  {

     1002                      // Multibyte char

     1003                      $utf_char = substr($str, $pos, $utf_len);

     1004                      $pos += $utf_len;

     1005                  }

     1006                  else

     1007                  {

     1008                      // A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode

     1009                      // replacement char and we will advance the cursor

     1010                      $spn = strspn($str, UTF8_TRAILING_BYTES, $pos);

     1011   

     1012                      if ($dump)

     1013                      {

     1014                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1015   

     1016                          // Dump combiners

     1017                          if (!empty($utf_sort))

     1018                          {

     1019                              if ($sort)

     1020                              {

     1021                                  ksort($utf_sort);

     1022                              }

     1023   

     1024                              foreach ($utf_sort as $utf_chars)

     1025                              {

     1026                                  $tmp .= implode('', $utf_chars);

     1027                              }

     1028                          }

     1029   

     1030                          $tmp .= str_repeat(UTF8_REPLACEMENT, $spn);

     1031                          $dump = $sort = 0;

     1032                      }

     1033                      else

     1034                      {

     1035                          $tmp .= substr($str, $tmp_pos, $pos - $tmp_pos) . str_repeat(UTF8_REPLACEMENT, $spn);

     1036                      }

     1037   

     1038                      $pos += $spn;

     1039                      $tmp_pos = $starter_pos = $pos;

     1040   

     1041                      $utf_sort = array();

     1042                      $last_cc = 0;

     1043   

     1044                      continue;

     1045                  }

     1046   

     1047                  // STEP 1: Decide what to do with current char

     1048   

     1049                  // Now, in that order:

     1050                  //  - check if that character is decomposable

     1051                  //  - check if that character is a non-starter

     1052                  //  - check if that character requires extra checks to be performed

     1053                  if (isset($decomp_map[$utf_char]))

     1054                  {

     1055                      // Decompose the char

     1056                      $_pos = 0;

     1057                      $_len = strlen($decomp_map[$utf_char]);

     1058   

     1059                      do

     1060                      {

     1061                          $c = $decomp_map[$utf_char][$_pos];

     1062                          $_utf_len =& $utf_len_mask[$c & "\xF0"];

     1063   

     1064                          if (isset($_utf_len))

     1065                          {

     1066                              $_utf_char = substr($decomp_map[$utf_char], $_pos, $_utf_len);

     1067                              $_pos += $_utf_len;

     1068   

     1069                              if (isset($utf_combining_class[$_utf_char]))

     1070                              {

     1071                                  // The character decomposed to a non-starter, buffer it for sorting

     1072                                  $utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;

     1073   

     1074                                  if ($utf_combining_class[$_utf_char] < $last_cc)

     1075                                  {

     1076                                      // Not canonically ordered, will require sorting

     1077                                      $sort = $dump = 1;

     1078                                  }

     1079                                  else

     1080                                  {

     1081                                      $dump = 1;

     1082                                      $last_cc = $utf_combining_class[$_utf_char];

     1083                                  }

     1084                              }

     1085                              else

     1086                              {

     1087                                  // This character decomposition contains a starter, dump the buffer and continue

     1088                                  if ($dump)

     1089                                  {

     1090                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1091   

     1092                                      // Dump combiners

     1093                                      if (!empty($utf_sort))

     1094                                      {

     1095                                          if ($sort)

     1096                                          {

     1097                                              ksort($utf_sort);

     1098                                          }

     1099   

     1100                                          foreach ($utf_sort as $utf_chars)

     1101                                          {

     1102                                              $tmp .= implode('', $utf_chars);

     1103                                          }

     1104                                      }

     1105   

     1106                                      $tmp .= $_utf_char;

     1107                                      $dump = $sort = 0;

     1108                                  }

     1109                                  else

     1110                                  {

     1111                                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos) . $_utf_char;

     1112                                  }

     1113   

     1114                                  $tmp_pos = $starter_pos = $pos;

     1115                                  $utf_sort = array();

     1116                                  $last_cc = 0;

     1117                              }

     1118                          }

     1119                          else

     1120                          {

     1121                              // This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue

     1122                              ++$_pos;

     1123   

     1124                              if ($dump)

     1125                              {

     1126                                  $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1127   

     1128                                  // Dump combiners

     1129                                  if (!empty($utf_sort))

     1130                                  {

     1131                                      if ($sort)

     1132                                      {

     1133                                          ksort($utf_sort);

     1134                                      }

     1135   

     1136                                      foreach ($utf_sort as $utf_chars)

     1137                                      {

     1138                                          $tmp .= implode('', $utf_chars);

     1139                                      }

     1140                                  }

     1141   

     1142                                  $tmp .= $c;

     1143                                  $dump = $sort = 0;

     1144                              }

     1145                              else

     1146                              {

     1147                                  $tmp .= substr($str, $tmp_pos, $pos - $utf_len - $tmp_pos) . $c;

     1148                              }

     1149   

     1150                              $tmp_pos = $starter_pos = $pos;

     1151                              $utf_sort = array();

     1152                              $last_cc = 0;

     1153                          }

     1154                      }

     1155                      while ($_pos < $_len);

     1156                  }

     1157                  else if (isset($utf_combining_class[$utf_char]))

     1158                  {

     1159                      // Combining character

     1160                      if ($utf_combining_class[$utf_char] < $last_cc)

     1161                      {

     1162                          // Not in canonical order

     1163                          $sort = $dump = 1;

     1164                      }

     1165                      else

     1166                      {

     1167                          $last_cc = $utf_combining_class[$utf_char];

     1168                      }

     1169   

     1170                      $utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;

     1171                  }

     1172                  else

     1173                  {

     1174                      // Non-decomposable starter, check out if it's a Hangul syllable

     1175                      if ($utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST)

     1176                      {

     1177                          // Nope, regular UTF char, check that we have the correct number of trailing bytes

     1178                          if (($utf_char & $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])

     1179                          {

     1180                              // Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char

     1181                              // has been encoded in a five- or six- byte sequence.

     1182                              // Move the cursor back to its original position then advance it to the position it should really be at

     1183                              $pos -= $utf_len;

     1184                              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1185   

     1186                              if (!empty($utf_sort))

     1187                              {

     1188                                  ksort($utf_sort);

     1189   

     1190                                  foreach ($utf_sort as $utf_chars)

     1191                                  {

     1192                                      $tmp .= implode('', $utf_chars);

     1193                                  }

     1194                                  $utf_sort = array();

     1195                              }

     1196   

     1197                              // Add a replacement char then another replacement char for every trailing byte.

     1198                              //

     1199                              // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this

     1200                              $spn = strspn($str, UTF8_TRAILING_BYTES, ++$pos);

     1201                              $tmp .= str_repeat(UTF8_REPLACEMENT, $spn + 1);

     1202   

     1203                              $dump = $sort = 0;

     1204   

     1205                              $pos += $spn;

     1206                              $tmp_pos = $pos;

     1207                              continue;

     1208                          }

     1209   

     1210                          if (isset($extra_check[$utf_char[0]]))

     1211                          {

     1212                              switch ($utf_char[0])

     1213                              {

     1214                                  // Note: 0xED is quite common in Korean

     1215                                  case "\xED":

     1216                                      if ($utf_char >= "\xED\xA0\x80")

     1217                                      {

     1218                                          // Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)

     1219                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1220   

     1221                                          if (!empty($utf_sort))

     1222                                          {

     1223                                              ksort($utf_sort);

     1224   

     1225                                              foreach ($utf_sort as $utf_chars)

     1226                                              {

     1227                                                  $tmp .= implode('', $utf_chars);

     1228                                              }

     1229                                              $utf_sort = array();

     1230                                          }

     1231   

     1232                                          $tmp .= UTF8_REPLACEMENT;

     1233                                          $dump = $sort = 0;

     1234   

     1235                                          $tmp_pos = $starter_pos = $pos;

     1236                                          continue 2;

     1237                                      }

     1238                                  break;

     1239   

     1240                                  // Note: 0xEF is quite common in Japanese

     1241                                  case "\xEF":

     1242                                      if ($utf_char == "\xEF\xBF\xBE" || $utf_char == "\xEF\xBF\xBF")

     1243                                      {

     1244                                          // U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)

     1245                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1246   

     1247                                          if (!empty($utf_sort))

     1248                                          {

     1249                                              ksort($utf_sort);

     1250   

     1251                                              foreach ($utf_sort as $utf_chars)

     1252                                              {

     1253                                                  $tmp .= implode('', $utf_chars);

     1254                                              }

     1255                                              $utf_sort = array();

     1256                                          }

     1257   

     1258                                          $tmp .= UTF8_REPLACEMENT;

     1259                                          $dump = $sort = 0;

     1260   

     1261                                          $tmp_pos = $starter_pos = $pos;

     1262                                          continue 2;

     1263                                      }

     1264                                  break;

     1265   

     1266                                  case "\xC0":

     1267                                  case "\xC1":

     1268                                      if ($utf_char <= "\xC1\xBF")

     1269                                      {

     1270                                          // Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char

     1271                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1272   

     1273                                          if (!empty($utf_sort))

     1274                                          {

     1275                                              ksort($utf_sort);

     1276   

     1277                                              foreach ($utf_sort as $utf_chars)

     1278                                              {

     1279                                                  $tmp .= implode('', $utf_chars);

     1280                                              }

     1281                                              $utf_sort = array();

     1282                                          }

     1283   

     1284                                          $tmp .= UTF8_REPLACEMENT;

     1285                                          $dump = $sort = 0;

     1286   

     1287                                          $tmp_pos = $starter_pos = $pos;

     1288                                          continue 2;

     1289                                      }

     1290                                  break;

     1291   

     1292                                  case "\xE0":

     1293                                      if ($utf_char <= "\xE0\x9F\xBF")

     1294                                      {

     1295                                          // Unicode char U+0000..U+07FF encoded in 3 bytes

     1296                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1297   

     1298                                          if (!empty($utf_sort))

     1299                                          {

     1300                                              ksort($utf_sort);

     1301   

     1302                                              foreach ($utf_sort as $utf_chars)

     1303                                              {

     1304                                                  $tmp .= implode('', $utf_chars);

     1305                                              }

     1306                                              $utf_sort = array();

     1307                                          }

     1308   

     1309                                          $tmp .= UTF8_REPLACEMENT;

     1310                                          $dump = $sort = 0;

     1311   

     1312                                          $tmp_pos = $starter_pos = $pos;

     1313                                          continue 2;

     1314                                      }

     1315                                  break;

     1316   

     1317                                  case "\xF0":

     1318                                      if ($utf_char <= "\xF0\x8F\xBF\xBF")

     1319                                      {

     1320                                          // Unicode char U+0000..U+FFFF encoded in 4 bytes

     1321                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1322   

     1323                                          if (!empty($utf_sort))

     1324                                          {

     1325                                              ksort($utf_sort);

     1326   

     1327                                              foreach ($utf_sort as $utf_chars)

     1328                                              {

     1329                                                  $tmp .= implode('', $utf_chars);

     1330                                              }

     1331                                              $utf_sort = array();

     1332                                          }

     1333   

     1334                                          $tmp .= UTF8_REPLACEMENT;

     1335                                          $dump = $sort = 0;

     1336   

     1337                                          $tmp_pos = $starter_pos = $pos;

     1338                                          continue 2;

     1339                                      }

     1340                                  break;

     1341   

     1342                                  default:

     1343                                      if ($utf_char > UTF8_MAX)

     1344                                      {

     1345                                          // Out of the Unicode range

     1346                                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1347   

     1348                                          if (!empty($utf_sort))

     1349                                          {

     1350                                              ksort($utf_sort);

     1351   

     1352                                              foreach ($utf_sort as $utf_chars)

     1353                                              {

     1354                                                  $tmp .= implode('', $utf_chars);

     1355                                              }

     1356                                              $utf_sort = array();

     1357                                          }

     1358   

     1359                                          $tmp .= UTF8_REPLACEMENT;

     1360                                          $dump = $sort = 0;

     1361   

     1362                                          $tmp_pos = $starter_pos = $pos;

     1363                                          continue 2;

     1364                                      }

     1365                                  break;

     1366                              }

     1367                          }

     1368                      }

     1369                      else

     1370                      {

     1371                          // Hangul syllable

     1372                          $idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;

     1373   

     1374                          // LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).

     1375                          //

     1376                          // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte

     1377                          if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)

     1378                          {

     1379                              if ($t_index < 25)

     1380                              {

     1381                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";

     1382                                  $utf_char[8] = chr(0xA7 + $t_index);

     1383                              }

     1384                              else

     1385                              {

     1386                                  $utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";

     1387                                  $utf_char[8] = chr(0x67 + $t_index);

     1388                              }

     1389                          }

     1390                          else

     1391                          {

     1392                              $utf_char = "\xE1\x84\x00\xE1\x85\x00";

     1393                          }

     1394   

     1395                          $utf_char[2] = chr(0x80 + (int) ($idx / UNICODE_HANGUL_NCOUNT));

     1396                          $utf_char[5] = chr(0xA1 + (int) (($idx % UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));

     1397   

     1398                          // Just like other decompositions, the resulting Jamos must be dumped to the tmp string

     1399                          $dump = 1;

     1400                      }

     1401   

     1402                      // Do we need to dump stuff to the tmp string?

     1403                      if ($dump)

     1404                      {

     1405                          $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1406   

     1407                          // Dump combiners

     1408                          if (!empty($utf_sort))

     1409                          {

     1410                              if ($sort)

     1411                              {

     1412                                  ksort($utf_sort);

     1413                              }

     1414   

     1415                              foreach ($utf_sort as $utf_chars)

     1416                              {

     1417                                  $tmp .= implode('', $utf_chars);

     1418                              }

     1419                          }

     1420   

     1421                          $tmp .= $utf_char;

     1422                          $dump = $sort = 0;

     1423                          $tmp_pos = $pos;

     1424                      }

     1425   

     1426                      $last_cc = 0;

     1427                      $utf_sort = array();

     1428                      $starter_pos = $pos;

     1429                  }

     1430              }

     1431              else

     1432              {

     1433                  // ASCII char, which happens to be a starter (as any other ASCII char)

     1434                  if ($dump)

     1435                  {

     1436                      $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1437   

     1438                      // Dump combiners

     1439                      if (!empty($utf_sort))

     1440                      {

     1441                          if ($sort)

     1442                          {

     1443                              ksort($utf_sort);

     1444                          }

     1445   

     1446                          foreach ($utf_sort as $utf_chars)

     1447                          {

     1448                              $tmp .= implode('', $utf_chars);

     1449                          }

     1450                      }

     1451   

     1452                      $tmp .= $str[$pos];

     1453                      $dump = $sort = 0;

     1454                      $tmp_pos = ++$pos;

     1455   

     1456                      $pos += strspn($str, UTF8_ASCII_RANGE, $pos);

     1457                  }

     1458                  else

     1459                  {

     1460                      $pos += strspn($str, UTF8_ASCII_RANGE, ++$pos);

     1461                  }

     1462   

     1463                  $last_cc = 0;

     1464                  $utf_sort = array();

     1465                  $starter_pos = $pos;

     1466              }

     1467          }

     1468          while ($pos < $len);

     1469   

     1470          // Now it is time to return the string

     1471          if ($dump)

     1472          {

     1473              $tmp .= substr($str, $tmp_pos, $starter_pos - $tmp_pos);

     1474   

     1475              // Dump combiners

     1476              if (!empty($utf_sort))

     1477              {

     1478                  if ($sort)

     1479                  {

     1480                      ksort($utf_sort);

     1481                  }

     1482   

     1483                  foreach ($utf_sort as $utf_chars)

     1484                  {

     1485                      $tmp .= implode('', $utf_chars);

     1486                  }

     1487              }

     1488   

     1489              return $tmp;

     1490          }

     1491          else if ($tmp_pos)

     1492          {

     1493              // If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version

     1494              if ($tmp_pos == $len)

     1495              {

     1496                  // The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str

     1497                  return $tmp;

     1498              }

     1499              else

     1500              {

     1501                  // The rightmost chunk of $str has not been appended to $tmp yet

     1502                  return $tmp . substr($str, $tmp_pos);

     1503              }

     1504          }

     1505   

     1506          // The string was already in normal form

     1507          return $str;

     1508      }

     1509  }

     1510