wimCMS • Blick zurück von phpBB 1.0.0 bis heute

Verzeichnisstruktur phpBB-3.0.0

Veröffentlicht: 12.12.2007

So funktioniert es

Verzeichnis-Info phpBB-1.0.0 / auth.php	Auf das letzte Element klicken. Damit geht es jeweils einen Schritt zurück.
admin	Auf das Icon klicken, dies öffnet das Verzeichnis. Nochmal klicken schließt das Verzeichnis. Auf den Verzeichnisnamen klicken, dies zeigt nur das Verzeichnis mit Inhalt an
(Beispiel Datei-Icons)	Auf das Icon klicken um den Quellcode anzuzeigen

generate_utf_tables.php

Zuletzt modifiziert: 09.10.2024, 12:50 - Dateigröße: 12.40 KiB


     001  <?php

     002  /**

     003  *

     004  * @package phpBB3

     005  * @version $Id$

     006  * @copyright (c) 2005 phpBB Group

     007  * @license http://opensource.org/licenses/gpl-license.php GNU Public License

     008  *

     009  */

     010   

     011  if (php_sapi_name() != 'cli')

     012  {

     013      die("This program must be run from the command line.\n");

     014  }

     015   

     016  //

     017  // Security message:

     018  //

     019  // This script is potentially dangerous.

     020  // Remove or comment the next line (die(".... ) to enable this script.

     021  // Do NOT FORGET to either remove this script or disable it after you have used it.

     022  //

     023  die("Please read the first lines of this script for instructions on how to enable it");

     024   

     025  set_time_limit(0);

     026   

     027  define('IN_PHPBB', true);

     028  $phpbb_root_path = '../';

     029  $phpEx = substr(strrchr(__FILE__, '.'), 1);

     030   

     031  echo "Checking for required files\n";

     032  download('http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt');

     033  download('http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt');

     034  download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');

     035  echo "\n";

     036   

     037  require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);

     038  $file_contents = array();

     039   

     040  /**

     041  * Generate some Hangul/Jamo stuff

     042  */

     043  echo "\nGenerating Hangul and Jamo tables\n";

     044  for ($i = 0; $i < UNICODE_HANGUL_LCOUNT; ++$i)

     045  {

     046      $utf_char = cp_to_utf(UNICODE_HANGUL_LBASE + $i);

     047      $file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT + UNICODE_HANGUL_SBASE;

     048      $file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_L;

     049  }

     050   

     051  for ($i = 0; $i < UNICODE_HANGUL_VCOUNT; ++$i)

     052  {

     053      $utf_char = cp_to_utf(UNICODE_HANGUL_VBASE + $i);

     054      $file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_TCOUNT;

     055      $file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_V;

     056  }

     057   

     058  for ($i = 0; $i < UNICODE_HANGUL_TCOUNT; ++$i)

     059  {

     060      $utf_char = cp_to_utf(UNICODE_HANGUL_TBASE + $i);

     061      $file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i;

     062      $file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_T;

     063  }

     064   

     065  /**

     066  * Load the CompositionExclusions table

     067  */

     068  echo "Loading CompositionExclusion\n";

     069  $fp = fopen('CompositionExclusions.txt', 'rt');

     070   

     071  $exclude = array();

     072  while (!feof($fp))

     073  {

     074      $line = fgets($fp, 1024);

     075   

     076      if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))

     077      {

     078          continue;

     079      }

     080   

     081      $cp = strtok($line, ' ');

     082   

     083      if ($pos = strpos($cp, '..'))

     084      {

     085          $start = hexdec(substr($cp, 0, $pos));

     086          $end = hexdec(substr($cp, $pos + 2));

     087   

     088          for ($i = $start; $i < $end; ++$i)

     089          {

     090              $exclude[$i] = 1;

     091          }

     092      }

     093      else

     094      {

     095          $exclude[hexdec($cp)] = 1;

     096      }

     097  }

     098  fclose($fp);

     099   

     100  /**

     101  * Load QuickCheck tables

     102  */

     103  echo "Generating QuickCheck tables\n";

     104  $fp = fopen('DerivedNormalizationProps.txt', 'rt');

     105   

     106  while (!feof($fp))

     107  {

     108      $line = fgets($fp, 1024);

     109   

     110      if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))

     111      {

     112          continue;

     113      }

     114   

     115      $p = array_map('trim', explode(';', strtok($line, '#')));

     116   

     117      /**

     118      * Capture only NFC_QC, NFKC_QC

     119      */

     120      if (!preg_match('#^NFK?C_QC$#', $p[1]))

     121      {

     122          continue;

     123      }

     124   

     125      if ($pos = strpos($p[0], '..'))

     126      {

     127          $start = hexdec(substr($p[0], 0, $pos));

     128          $end = hexdec(substr($p[0], $pos + 2));

     129      }

     130      else

     131      {

     132          $start = $end = hexdec($p[0]);

     133      }

     134   

     135      if ($start >= UTF8_HANGUL_FIRST && $end <= UTF8_HANGUL_LAST)

     136      {

     137          /**

     138          * We do not store Hangul syllables in the array

     139          */

     140          continue;

     141      }

     142   

     143      if ($p[2] == 'M')

     144      {

     145          $val = UNICODE_QC_MAYBE;

     146      }

     147      else

     148      {

     149          $val = UNICODE_QC_NO;

     150      }

     151   

     152      if ($p[1] == 'NFKC_QC')

     153      {

     154          $file = 'utf_nfkc_qc';

     155      }

     156      else

     157      {

     158          $file = 'utf_nfc_qc';

     159      }

     160   

     161      for ($i = $start; $i <= $end; ++$i)

     162      {

     163          /**

     164          * The vars have the same name as the file: $utf_nfc_qc is in utf_nfc_qc.php

     165          */

     166          $file_contents[$file][$file][cp_to_utf($i)] = $val;

     167      }

     168  }

     169  fclose($fp);

     170   

     171  /**

     172  * Do mappings

     173  */

     174  echo "Loading Unicode decomposition mappings\n";

     175  $fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');

     176   

     177  $map = array();

     178  while (!feof($fp))

     179  {

     180      $p = explode(';', fgets($fp, 1024));

     181      $cp = hexdec($p[0]);

     182   

     183      if (!empty($p[3]))

     184      {

     185          /**

     186          * Store combining class > 0

     187          */

     188          $file_contents['utf_normalizer_common']['utf_combining_class'][cp_to_utf($cp)] = (int) $p[3];

     189      }

     190   

     191      if (!isset($p[5]) || !preg_match_all('#[0-9A-F]+#', strip_tags($p[5]), $m))

     192      {

     193          continue;

     194      }

     195   

     196      if (strpos($p[5], '>'))

     197      {

     198          $map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));

     199      }

     200      else

     201      {

     202          $map['NFD'][$cp] = $map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));

     203      }

     204  }

     205  fclose($fp);

     206   

     207  /**

     208  * Build the canonical composition table

     209  */

     210  echo "Generating the Canonical Composition table\n";

     211  foreach ($map['NFD'] as $cp => $decomp_seq)

     212  {

     213      if (!strpos($decomp_seq, ' ') || isset($exclude[$cp]))

     214      {

     215          /**

     216          * Singletons are excluded from canonical composition

     217          */

     218          continue;

     219      }

     220   

     221      $utf_seq = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));

     222   

     223      if (!isset($file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq]))

     224      {

     225          $file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq] = cp_to_utf($cp);

     226      }

     227  }

     228   

     229  /**

     230  * Decompose the NF[K]D mappings recursively and prepare the file contents

     231  */

     232  echo "Generating the Canonical and Compatibility Decomposition tables\n\n";

     233  foreach ($map as $type => $decomp_map)

     234  {

     235      foreach ($decomp_map as $cp => $decomp_seq)

     236      {

     237          $decomp_map[$cp] = decompose($decomp_map, $decomp_seq);

     238      }

     239      unset($decomp_seq);

     240   

     241      if ($type == 'NFKD')

     242      {

     243          $file = 'utf_compatibility_decomp';

     244          $var = 'utf_compatibility_decomp';

     245      }

     246      else

     247      {

     248          $file = 'utf_canonical_decomp';

     249          $var = 'utf_canonical_decomp';

     250      }

     251   

     252      /**

     253      * Generate the corresponding file

     254      */

     255      foreach ($decomp_map as $cp => $decomp_seq)

     256      {

     257          $file_contents[$file][$var][cp_to_utf($cp)] = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));

     258      }

     259  }

     260   

     261  /**

     262  * Generate and/or alter the files

     263  */

     264  foreach ($file_contents as $file => $contents)

     265  {

     266      /**

     267      * Generate a new file

     268      */

     269      echo "Writing to $file.$phpEx\n";

     270   

     271      if (!$fp = fopen($phpbb_root_path . 'includes/utf/data/' . $file . '.' . $phpEx, 'wb'))

     272      {

     273          trigger_error('Cannot open ' . $file . ' for write');

     274      }

     275   

     276      fwrite($fp, '<?php');

     277      foreach ($contents as $var => $val)

     278      {

     279          fwrite($fp, "\n\$GLOBALS[" . my_var_export($var) . ']=' . my_var_export($val) . ";");

     280      }

     281      fclose($fp);

     282  }

     283   

     284  echo "\n*** UTF-8 normalization tables done\n\n";

     285   

     286  /**

     287  * Now we'll generate the files needed by the search indexer

     288  */

     289  echo "Generating search indexer tables\n";

     290   

     291  $fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');

     292   

     293  $map = array();

     294  while ($line = fgets($fp, 1024))

     295  {

     296      /**

     297      * The current line is split, $m[0] hold the codepoint in hexadecimal and

     298      * all other fields numbered as in http://www.unicode.org/Public/UNIDATA/UCD.html#UnicodeData.txt

     299      */

     300      $m = explode(';', $line);

     301   

     302      /**

     303      * @var    integer    $cp            Current char codepoint

     304      * @var    string    $utf_char    UTF-8 representation of current char

     305      */

     306      $cp = hexdec($m[0]);

     307      $utf_char = cp_to_utf($cp);

     308   

     309      /**

     310      * $m[2] holds the "General Category" of the character

     311      * @link http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values

     312      */

     313      switch ($m[2][0])

     314      {

     315          case 'L':

     316              /**

     317              * We allow all letters and map them to their lowercased counterpart on the fly

     318              */

     319              $map_to_hex = (isset($m[13][0])) ? $m[13] : $m[0];

     320   

     321              if (preg_match('#^LATIN.*(?:LETTER|LIGATURE) ([A-Z]{2}(?![A-Z]))$#', $m[1], $capture))

     322              {

     323                  /**

     324                  * Special hack for some latin ligatures. Using the name of a character

     325                  * is bad practice, but for now it works well enough.

     326                  *

     327                  * @todo Note that ligatures with combining marks such as U+01E2 are

     328                  * not supported at this time

     329                  */

     330                  $map[$cp] = strtolower($capture[1]);

     331              }

     332              else if (isset($m[13][0]))

     333              {

     334                  /**

     335                  * If the letter has a lowercased form, use it

     336                  */

     337                  $map[$cp] = hex_to_utf($m[13]);

     338              }

     339              else

     340              {

     341                  /**

     342                  * In all other cases, map the letter to itself

     343                  */

     344                  $map[$cp] = $utf_char;

     345              }

     346              break;

     347   

     348          case 'M':

     349              /**

     350              * We allow all marks, they are mapped to themselves

     351              */

     352              $map[$cp] = $utf_char;

     353              break;

     354   

     355          case 'N':

     356              /**

     357              * We allow all numbers, but we map them to their numeric value whenever

     358              * possible. The numeric value (field #8) is in ASCII already

     359              *

     360              * @todo Note that fractions such as U+00BD will be converted to something

     361              * like "1/2", with a slash. However, "1/2" entered in ASCII is converted

     362              * to "1 2". This will have to be fixed.

     363              */

     364              $map[$cp] = (isset($m[8][0])) ? $m[8] : $utf_char;

     365              break;

     366   

     367          default:

     368              /**

     369              * Everything else is ignored, skip to the next line

     370              */

     371              continue 2;

     372      }

     373  }

     374  fclose($fp);

     375   

     376  /**

     377  * Add some cheating

     378  */

     379  $cheats = array(

     380      '00DF'    =>    'ss',        #    German sharp S

     381      '00C5'    =>    'ae',        #    Capital A with diaeresis

     382      '00E4'    =>    'ae',        #    Small A with diaeresis

     383      '00D6'    =>    'oe',        #    Capital O with diaeresis

     384      '00F6'    =>    'oe',        #    Small O with diaeresis

     385      '00DC'    =>    'ue',        #    Capital U with diaeresis

     386      '00FC'    =>    'ue',        #    Small U with diaeresis

     387  );

     388   

     389  /**

     390  * Add our "cheat replacements" to the map

     391  */

     392  foreach ($cheats as $hex => $map_to)

     393  {

     394      $map[hexdec($hex)] = $map_to;

     395  }

     396   

     397  /**

     398  * Split the map into smaller blocks

     399  */

     400  $file_contents = array();

     401  foreach ($map as $cp => $map_to)

     402  {

     403      $file_contents[$cp >> 11][cp_to_utf($cp)] = $map_to;

     404  }

     405  unset($map);

     406   

     407  foreach ($file_contents as $idx => $contents)

     408  {

     409      echo "Writing to search_indexer_$idx.$phpEx\n";

     410      $fp = fopen($phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx, 'wb');

     411      fwrite($fp, '<?php return ' . my_var_export($contents) . ';');

     412      fclose($fp);

     413  }

     414  echo "\n*** Search indexer tables done\n\n";

     415   

     416   

     417  die("\nAll done!\n");

     418   

     419   

     420  ////////////////////////////////////////////////////////////////////////////////

     421  //                             Internal functions                             //

     422  ////////////////////////////////////////////////////////////////////////////////

     423   

     424  /**

     425  * Decompose a sequence recusively

     426  *

     427  * @param    array    $decomp_map    Decomposition mapping, passed by reference

     428  * @param    string    $decomp_seq    Decomposition sequence as decimal codepoints separated with a space

     429  * @return    string                Decomposition sequence, fully decomposed

     430  */

     431  function decompose(&$decomp_map, $decomp_seq)

     432  {

     433      $ret = array();

     434      foreach (explode(' ', $decomp_seq) as $cp)

     435      {

     436          if (isset($decomp_map[$cp]))

     437          {

     438              $ret[] = decompose($decomp_map, $decomp_map[$cp]);

     439          }

     440          else

     441          {

     442              $ret[] = $cp;

     443          }

     444      }

     445   

     446      return implode(' ', $ret);

     447  }

     448   

     449   

     450  /**

     451  * Return a parsable string representation of a variable

     452  *

     453  * This is function is limited to array/strings/integers

     454  *

     455  * @param    mixed    $var        Variable

     456  * @return    string                PHP code representing the variable

     457  */

     458  function my_var_export($var)

     459  {

     460      if (is_array($var))

     461      {

     462          $lines = array();

     463   

     464          foreach ($var as $k => $v)

     465          {

     466              $lines[] = my_var_export($k) . '=>' . my_var_export($v);

     467          }

     468   

     469          return 'array(' . implode(',', $lines) . ')';

     470      }

     471      else if (is_string($var))

     472      {

     473          return "'" . str_replace(array('\\', "'"), array('\\\\', "\\'"), $var) . "'";

     474      }

     475      else

     476      {

     477          return $var;

     478      }

     479  }

     480   

     481  /**

     482  * Download a file to the develop/ dir

     483  *

     484  * @param    string    $url        URL of the file to download

     485  * @return    void

     486  */

     487  function download($url)

     488  {

     489      global $phpbb_root_path;

     490   

     491      if (file_exists($phpbb_root_path . 'develop/' . basename($url)))

     492      {

     493          return;

     494      }

     495   

     496      echo 'Downloading from ', $url, ' ';

     497   

     498      if (!$fpr = fopen($url, 'rb'))

     499      {

     500          die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");

     501      }

     502   

     503      if (!$fpw = fopen($phpbb_root_path . 'develop/' . basename($url), 'wb'))

     504      {

     505          die("Can't open develop/" . basename($url) . " for output... please check your permissions or something");

     506      }

     507   

     508      $i = 0;

     509      $chunk = 32768;

     510      $done = '';

     511   

     512      while (!feof($fpr))

     513      {

     514          $i += fwrite($fpw, fread($fpr, $chunk));

     515          echo str_repeat("\x08", strlen($done));

     516   

     517          $done = ($i >> 10) . ' KiB';

     518          echo $done;

     519      }

     520      fclose($fpr);

     521      fclose($fpw);

     522   

     523      echo "\n";

     524  }

     525   

     526  /**

     527  * Convert a codepoint in hexadecimal to a UTF-8 char

     528  *

     529  * @param    string    $hex        Codepoint, in hexadecimal

     530  * @return    string                UTF-8 char

     531  */

     532  function hex_to_utf($hex)

     533  {

     534      return cp_to_utf(hexdec($hex));

     535  }

     536   

     537  /**

     538  * Return a UTF string formed from a sequence of codepoints in hexadecimal

     539  *

     540  * @param    string    $seq        Sequence of codepoints, separated with a space

     541  * @return    string                UTF-8 string

     542  */

     543  function hexseq_to_utf($seq)

     544  {

     545      return implode('', array_map('hex_to_utf', explode(' ', $seq)));

     546  }

     547   

     548  /**

     549  * Convert a codepoint to a UTF-8 char

     550  *

     551  * @param    integer    $cp            Unicode codepoint

     552  * @return    string                UTF-8 string

     553  */

     554  function cp_to_utf($cp)

     555  {

     556      if ($cp > 0xFFFF)

     557      {

     558          return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));

     559      }

     560      else if ($cp > 0x7FF)

     561      {

     562          return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));

     563      }

     564      else if ($cp > 0x7F)

     565      {

     566          return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));

     567      }

     568      else

     569      {

     570          return chr($cp);

     571      }

     572  }

Verzeichnisstruktur phpBB-3.0.0

Zuletzt modifiziert: 09.10.2024, 12:50 - Dateigröße: 12.40 KiB

generate_utf_tables.php


     001  <?php

     002  /**

     003  *

     004  * @package phpBB3

     005  * @version $Id$

     006  * @copyright (c) 2005 phpBB Group

     007  * @license http://opensource.org/licenses/gpl-license.php GNU Public License

     008  *

     009  */

     010   

     011  if (php_sapi_name() != 'cli')

     012  {

     013      die("This program must be run from the command line.\n");

     014  }

     015   

     016  //

     017  // Security message:

     018  //

     019  // This script is potentially dangerous.

     020  // Remove or comment the next line (die(".... ) to enable this script.

     021  // Do NOT FORGET to either remove this script or disable it after you have used it.

     022  //

     023  die("Please read the first lines of this script for instructions on how to enable it");

     024   

     025  set_time_limit(0);

     026   

     027  define('IN_PHPBB', true);

     028  $phpbb_root_path = '../';

     029  $phpEx = substr(strrchr(__FILE__, '.'), 1);

     030   

     031  echo "Checking for required files\n";

     032  download('http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt');

     033  download('http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt');

     034  download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');

     035  echo "\n";

     036   

     037  require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);

     038  $file_contents = array();

     039   

     040  /**

     041  * Generate some Hangul/Jamo stuff

     042  */

     043  echo "\nGenerating Hangul and Jamo tables\n";

     044  for ($i = 0; $i < UNICODE_HANGUL_LCOUNT; ++$i)

     045  {

     046      $utf_char = cp_to_utf(UNICODE_HANGUL_LBASE + $i);

     047      $file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT + UNICODE_HANGUL_SBASE;

     048      $file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_L;

     049  }

     050   

     051  for ($i = 0; $i < UNICODE_HANGUL_VCOUNT; ++$i)

     052  {

     053      $utf_char = cp_to_utf(UNICODE_HANGUL_VBASE + $i);

     054      $file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_TCOUNT;

     055      $file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_V;

     056  }

     057   

     058  for ($i = 0; $i < UNICODE_HANGUL_TCOUNT; ++$i)

     059  {

     060      $utf_char = cp_to_utf(UNICODE_HANGUL_TBASE + $i);

     061      $file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i;

     062      $file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_T;

     063  }

     064   

     065  /**

     066  * Load the CompositionExclusions table

     067  */

     068  echo "Loading CompositionExclusion\n";

     069  $fp = fopen('CompositionExclusions.txt', 'rt');

     070   

     071  $exclude = array();

     072  while (!feof($fp))

     073  {

     074      $line = fgets($fp, 1024);

     075   

     076      if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))

     077      {

     078          continue;

     079      }

     080   

     081      $cp = strtok($line, ' ');

     082   

     083      if ($pos = strpos($cp, '..'))

     084      {

     085          $start = hexdec(substr($cp, 0, $pos));

     086          $end = hexdec(substr($cp, $pos + 2));

     087   

     088          for ($i = $start; $i < $end; ++$i)

     089          {

     090              $exclude[$i] = 1;

     091          }

     092      }

     093      else

     094      {

     095          $exclude[hexdec($cp)] = 1;

     096      }

     097  }

     098  fclose($fp);

     099   

     100  /**

     101  * Load QuickCheck tables

     102  */

     103  echo "Generating QuickCheck tables\n";

     104  $fp = fopen('DerivedNormalizationProps.txt', 'rt');

     105   

     106  while (!feof($fp))

     107  {

     108      $line = fgets($fp, 1024);

     109   

     110      if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))

     111      {

     112          continue;

     113      }

     114   

     115      $p = array_map('trim', explode(';', strtok($line, '#')));

     116   

     117      /**

     118      * Capture only NFC_QC, NFKC_QC

     119      */

     120      if (!preg_match('#^NFK?C_QC$#', $p[1]))

     121      {

     122          continue;

     123      }

     124   

     125      if ($pos = strpos($p[0], '..'))

     126      {

     127          $start = hexdec(substr($p[0], 0, $pos));

     128          $end = hexdec(substr($p[0], $pos + 2));

     129      }

     130      else

     131      {

     132          $start = $end = hexdec($p[0]);

     133      }

     134   

     135      if ($start >= UTF8_HANGUL_FIRST && $end <= UTF8_HANGUL_LAST)

     136      {

     137          /**

     138          * We do not store Hangul syllables in the array

     139          */

     140          continue;

     141      }

     142   

     143      if ($p[2] == 'M')

     144      {

     145          $val = UNICODE_QC_MAYBE;

     146      }

     147      else

     148      {

     149          $val = UNICODE_QC_NO;

     150      }

     151   

     152      if ($p[1] == 'NFKC_QC')

     153      {

     154          $file = 'utf_nfkc_qc';

     155      }

     156      else

     157      {

     158          $file = 'utf_nfc_qc';

     159      }

     160   

     161      for ($i = $start; $i <= $end; ++$i)

     162      {

     163          /**

     164          * The vars have the same name as the file: $utf_nfc_qc is in utf_nfc_qc.php

     165          */

     166          $file_contents[$file][$file][cp_to_utf($i)] = $val;

     167      }

     168  }

     169  fclose($fp);

     170   

     171  /**

     172  * Do mappings

     173  */

     174  echo "Loading Unicode decomposition mappings\n";

     175  $fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');

     176   

     177  $map = array();

     178  while (!feof($fp))

     179  {

     180      $p = explode(';', fgets($fp, 1024));

     181      $cp = hexdec($p[0]);

     182   

     183      if (!empty($p[3]))

     184      {

     185          /**

     186          * Store combining class > 0

     187          */

     188          $file_contents['utf_normalizer_common']['utf_combining_class'][cp_to_utf($cp)] = (int) $p[3];

     189      }

     190   

     191      if (!isset($p[5]) || !preg_match_all('#[0-9A-F]+#', strip_tags($p[5]), $m))

     192      {

     193          continue;

     194      }

     195   

     196      if (strpos($p[5], '>'))

     197      {

     198          $map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));

     199      }

     200      else

     201      {

     202          $map['NFD'][$cp] = $map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));

     203      }

     204  }

     205  fclose($fp);

     206   

     207  /**

     208  * Build the canonical composition table

     209  */

     210  echo "Generating the Canonical Composition table\n";

     211  foreach ($map['NFD'] as $cp => $decomp_seq)

     212  {

     213      if (!strpos($decomp_seq, ' ') || isset($exclude[$cp]))

     214      {

     215          /**

     216          * Singletons are excluded from canonical composition

     217          */

     218          continue;

     219      }

     220   

     221      $utf_seq = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));

     222   

     223      if (!isset($file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq]))

     224      {

     225          $file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq] = cp_to_utf($cp);

     226      }

     227  }

     228   

     229  /**

     230  * Decompose the NF[K]D mappings recursively and prepare the file contents

     231  */

     232  echo "Generating the Canonical and Compatibility Decomposition tables\n\n";

     233  foreach ($map as $type => $decomp_map)

     234  {

     235      foreach ($decomp_map as $cp => $decomp_seq)

     236      {

     237          $decomp_map[$cp] = decompose($decomp_map, $decomp_seq);

     238      }

     239      unset($decomp_seq);

     240   

     241      if ($type == 'NFKD')

     242      {

     243          $file = 'utf_compatibility_decomp';

     244          $var = 'utf_compatibility_decomp';

     245      }

     246      else

     247      {

     248          $file = 'utf_canonical_decomp';

     249          $var = 'utf_canonical_decomp';

     250      }

     251   

     252      /**

     253      * Generate the corresponding file

     254      */

     255      foreach ($decomp_map as $cp => $decomp_seq)

     256      {

     257          $file_contents[$file][$var][cp_to_utf($cp)] = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));

     258      }

     259  }

     260   

     261  /**

     262  * Generate and/or alter the files

     263  */

     264  foreach ($file_contents as $file => $contents)

     265  {

     266      /**

     267      * Generate a new file

     268      */

     269      echo "Writing to $file.$phpEx\n";

     270   

     271      if (!$fp = fopen($phpbb_root_path . 'includes/utf/data/' . $file . '.' . $phpEx, 'wb'))

     272      {

     273          trigger_error('Cannot open ' . $file . ' for write');

     274      }

     275   

     276      fwrite($fp, '<?php');

     277      foreach ($contents as $var => $val)

     278      {

     279          fwrite($fp, "\n\$GLOBALS[" . my_var_export($var) . ']=' . my_var_export($val) . ";");

     280      }

     281      fclose($fp);

     282  }

     283   

     284  echo "\n*** UTF-8 normalization tables done\n\n";

     285   

     286  /**

     287  * Now we'll generate the files needed by the search indexer

     288  */

     289  echo "Generating search indexer tables\n";

     290   

     291  $fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');

     292   

     293  $map = array();

     294  while ($line = fgets($fp, 1024))

     295  {

     296      /**

     297      * The current line is split, $m[0] hold the codepoint in hexadecimal and

     298      * all other fields numbered as in http://www.unicode.org/Public/UNIDATA/UCD.html#UnicodeData.txt

     299      */

     300      $m = explode(';', $line);

     301   

     302      /**

     303      * @var    integer    $cp            Current char codepoint

     304      * @var    string    $utf_char    UTF-8 representation of current char

     305      */

     306      $cp = hexdec($m[0]);

     307      $utf_char = cp_to_utf($cp);

     308   

     309      /**

     310      * $m[2] holds the "General Category" of the character

     311      * @link http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values

     312      */

     313      switch ($m[2][0])

     314      {

     315          case 'L':

     316              /**

     317              * We allow all letters and map them to their lowercased counterpart on the fly

     318              */

     319              $map_to_hex = (isset($m[13][0])) ? $m[13] : $m[0];

     320   

     321              if (preg_match('#^LATIN.*(?:LETTER|LIGATURE) ([A-Z]{2}(?![A-Z]))$#', $m[1], $capture))

     322              {

     323                  /**

     324                  * Special hack for some latin ligatures. Using the name of a character

     325                  * is bad practice, but for now it works well enough.

     326                  *

     327                  * @todo Note that ligatures with combining marks such as U+01E2 are

     328                  * not supported at this time

     329                  */

     330                  $map[$cp] = strtolower($capture[1]);

     331              }

     332              else if (isset($m[13][0]))

     333              {

     334                  /**

     335                  * If the letter has a lowercased form, use it

     336                  */

     337                  $map[$cp] = hex_to_utf($m[13]);

     338              }

     339              else

     340              {

     341                  /**

     342                  * In all other cases, map the letter to itself

     343                  */

     344                  $map[$cp] = $utf_char;

     345              }

     346              break;

     347   

     348          case 'M':

     349              /**

     350              * We allow all marks, they are mapped to themselves

     351              */

     352              $map[$cp] = $utf_char;

     353              break;

     354   

     355          case 'N':

     356              /**

     357              * We allow all numbers, but we map them to their numeric value whenever

     358              * possible. The numeric value (field #8) is in ASCII already

     359              *

     360              * @todo Note that fractions such as U+00BD will be converted to something

     361              * like "1/2", with a slash. However, "1/2" entered in ASCII is converted

     362              * to "1 2". This will have to be fixed.

     363              */

     364              $map[$cp] = (isset($m[8][0])) ? $m[8] : $utf_char;

     365              break;

     366   

     367          default:

     368              /**

     369              * Everything else is ignored, skip to the next line

     370              */

     371              continue 2;

     372      }

     373  }

     374  fclose($fp);

     375   

     376  /**

     377  * Add some cheating

     378  */

     379  $cheats = array(

     380      '00DF'    =>    'ss',        #    German sharp S

     381      '00C5'    =>    'ae',        #    Capital A with diaeresis

     382      '00E4'    =>    'ae',        #    Small A with diaeresis

     383      '00D6'    =>    'oe',        #    Capital O with diaeresis

     384      '00F6'    =>    'oe',        #    Small O with diaeresis

     385      '00DC'    =>    'ue',        #    Capital U with diaeresis

     386      '00FC'    =>    'ue',        #    Small U with diaeresis

     387  );

     388   

     389  /**

     390  * Add our "cheat replacements" to the map

     391  */

     392  foreach ($cheats as $hex => $map_to)

     393  {

     394      $map[hexdec($hex)] = $map_to;

     395  }

     396   

     397  /**

     398  * Split the map into smaller blocks

     399  */

     400  $file_contents = array();

     401  foreach ($map as $cp => $map_to)

     402  {

     403      $file_contents[$cp >> 11][cp_to_utf($cp)] = $map_to;

     404  }

     405  unset($map);

     406   

     407  foreach ($file_contents as $idx => $contents)

     408  {

     409      echo "Writing to search_indexer_$idx.$phpEx\n";

     410      $fp = fopen($phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $phpEx, 'wb');

     411      fwrite($fp, '<?php return ' . my_var_export($contents) . ';');

     412      fclose($fp);

     413  }

     414  echo "\n*** Search indexer tables done\n\n";

     415   

     416   

     417  die("\nAll done!\n");

     418   

     419   

     420  ////////////////////////////////////////////////////////////////////////////////

     421  //                             Internal functions                             //

     422  ////////////////////////////////////////////////////////////////////////////////

     423   

     424  /**

     425  * Decompose a sequence recusively

     426  *

     427  * @param    array    $decomp_map    Decomposition mapping, passed by reference

     428  * @param    string    $decomp_seq    Decomposition sequence as decimal codepoints separated with a space

     429  * @return    string                Decomposition sequence, fully decomposed

     430  */

     431  function decompose(&$decomp_map, $decomp_seq)

     432  {

     433      $ret = array();

     434      foreach (explode(' ', $decomp_seq) as $cp)

     435      {

     436          if (isset($decomp_map[$cp]))

     437          {

     438              $ret[] = decompose($decomp_map, $decomp_map[$cp]);

     439          }

     440          else

     441          {

     442              $ret[] = $cp;

     443          }

     444      }

     445   

     446      return implode(' ', $ret);

     447  }

     448   

     449   

     450  /**

     451  * Return a parsable string representation of a variable

     452  *

     453  * This is function is limited to array/strings/integers

     454  *

     455  * @param    mixed    $var        Variable

     456  * @return    string                PHP code representing the variable

     457  */

     458  function my_var_export($var)

     459  {

     460      if (is_array($var))

     461      {

     462          $lines = array();

     463   

     464          foreach ($var as $k => $v)

     465          {

     466              $lines[] = my_var_export($k) . '=>' . my_var_export($v);

     467          }

     468   

     469          return 'array(' . implode(',', $lines) . ')';

     470      }

     471      else if (is_string($var))

     472      {

     473          return "'" . str_replace(array('\\', "'"), array('\\\\', "\\'"), $var) . "'";

     474      }

     475      else

     476      {

     477          return $var;

     478      }

     479  }

     480   

     481  /**

     482  * Download a file to the develop/ dir

     483  *

     484  * @param    string    $url        URL of the file to download

     485  * @return    void

     486  */

     487  function download($url)

     488  {

     489      global $phpbb_root_path;

     490   

     491      if (file_exists($phpbb_root_path . 'develop/' . basename($url)))

     492      {

     493          return;

     494      }

     495   

     496      echo 'Downloading from ', $url, ' ';

     497   

     498      if (!$fpr = fopen($url, 'rb'))

     499      {

     500          die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");

     501      }

     502   

     503      if (!$fpw = fopen($phpbb_root_path . 'develop/' . basename($url), 'wb'))

     504      {

     505          die("Can't open develop/" . basename($url) . " for output... please check your permissions or something");

     506      }

     507   

     508      $i = 0;

     509      $chunk = 32768;

     510      $done = '';

     511   

     512      while (!feof($fpr))

     513      {

     514          $i += fwrite($fpw, fread($fpr, $chunk));

     515          echo str_repeat("\x08", strlen($done));

     516   

     517          $done = ($i >> 10) . ' KiB';

     518          echo $done;

     519      }

     520      fclose($fpr);

     521      fclose($fpw);

     522   

     523      echo "\n";

     524  }

     525   

     526  /**

     527  * Convert a codepoint in hexadecimal to a UTF-8 char

     528  *

     529  * @param    string    $hex        Codepoint, in hexadecimal

     530  * @return    string                UTF-8 char

     531  */

     532  function hex_to_utf($hex)

     533  {

     534      return cp_to_utf(hexdec($hex));

     535  }

     536   

     537  /**

     538  * Return a UTF string formed from a sequence of codepoints in hexadecimal

     539  *

     540  * @param    string    $seq        Sequence of codepoints, separated with a space

     541  * @return    string                UTF-8 string

     542  */

     543  function hexseq_to_utf($seq)

     544  {

     545      return implode('', array_map('hex_to_utf', explode(' ', $seq)));

     546  }

     547   

     548  /**

     549  * Convert a codepoint to a UTF-8 char

     550  *

     551  * @param    integer    $cp            Unicode codepoint

     552  * @return    string                UTF-8 string

     553  */

     554  function cp_to_utf($cp)

     555  {

     556      if ($cp > 0xFFFF)

     557      {

     558          return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));

     559      }

     560      else if ($cp > 0x7FF)

     561      {

     562          return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));

     563      }

     564      else if ($cp > 0x7F)

     565      {

     566          return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));

     567      }

     568      else

     569      {

     570          return chr($cp);

     571      }

     572  }