Verzeichnisstruktur phpBB-3.3.15


Veröffentlicht
28.08.2024

So funktioniert es


Auf das letzte Element klicken. Dies geht jeweils einen Schritt zurück.

Auf das Icon klicken, dies öffnet das Verzeichnis. Nochmal klicken schließt das Verzeichnis.
Auf den Verzeichnisnamen klicken, dies zeigt nur das Verzeichnis mit Inhalt an

(Beispiel Datei-Icons)

Auf das Icon klicken, um den Quellcode anzuzeigen

RegexpParser.php

Zuletzt modifiziert: 02.04.2025, 15:04 - Dateigröße: 9.77 KiB


001  <?php
002   
003  /**
004  * @package   s9e\TextFormatter
005  * @copyright Copyright (c) 2010-2022 The s9e authors
006  * @license   http://www.opensource.org/licenses/mit-license.php The MIT License
007  */
008  namespace s9e\TextFormatter\Configurator\Helpers;
009   
010  use RuntimeException;
011   
abstract class RegexpParser
{
    /**
    * Generate a regexp that matches any single character allowed in a regexp
    *
    * This method will generate a regexp that can be used to determine whether a given character
    * could in theory be allowed in a string that matches the source regexp. For example, the source
    * regexp /^a+$/D would generate /[a]/ while /^foo\d+$/D would generate /[foo\d]/ whereas the
    * regexp /foo/ would generate // because it's not anchored so any characters could be found
    * before or after the literal "foo".
    *
    * The permissive regexp // is also returned when the source regexp uses the "m" modifier, when
    * it contains a root-level alternation that is not anchored on both sides, when its trailing $
    * is escaped (and therefore a literal rather than an anchor) or when it contains an unescaped
    * dot that can match newlines.
    *
    * @param  string $regexp Source regexp
    * @return string         Regexp that matches any single character allowed in the source regexp
    *
    * @throws RuntimeException if the source regexp cannot be parsed
    */
    public static function getAllowedCharacterRegexp($regexp)
    {
        $def = self::parse($regexp);

        // If the regexp uses the multiline modifier, this regexp can't match the whole string if
        // it contains newlines, so in effect it could allow any content
        if (strpos($def['modifiers'], 'm') !== false)
        {
            return '//';
        }

        // An unanchored regexp can be surrounded by arbitrary content
        if (!self::isAnchoredAtBothEnds($def['regexp']))
        {
            return '//';
        }

        // Append a token to mark the end of the regexp
        $def['tokens'][] = [
            'pos'  => strlen($def['regexp']),
            'len'  => 0,
            'type' => 'end'
        ];

        $patterns = [];

        // Collect the literal portions of the source regexp while testing for alternations
        $literal = '';
        $pos     = 0;
        $skipPos = 0;
        $depth   = 0;
        foreach ($def['tokens'] as $token)
        {
            // Skip options
            if ($token['type'] === 'option')
            {
                $skipPos = max($skipPos, $token['pos'] + $token['len']);
            }

            // Skip assertions: everything up to the end of the matching end token is ignored
            if (strpos($token['type'], 'AssertionStart') !== false)
            {
                $endToken = $def['tokens'][$token['endToken']];
                $skipPos  = max($skipPos, $endToken['pos'] + $endToken['len']);
            }

            if ($token['pos'] >= $skipPos)
            {
                // Character classes are reused as-is in the generated regexp
                if ($token['type'] === 'characterClass')
                {
                    $patterns[] = '[' . $token['content'] . ']';
                }

                if ($token['pos'] > $pos)
                {
                    // Capture the content between last position and current position
                    $tmp = substr($def['regexp'], $pos, $token['pos'] - $pos);

                    // Append the content to the literal portion
                    $literal .= $tmp;

                    // Test for alternations if it's the root of the regexp
                    if (!$depth)
                    {
                        // Remove literal backslashes for convenience
                        $tmp = str_replace('\\\\', '', $tmp);

                        // Look for an unescaped | that is not followed by ^
                        if (preg_match('/(?<!\\\\)\\|(?!\\^)/', $tmp))
                        {
                            return '//';
                        }

                        // Look for an unescaped | that is not preceded by $
                        if (preg_match('/(?<![$\\\\])\\|/', $tmp))
                        {
                            return '//';
                        }
                    }
                }
            }

            // Keep track of the nesting level, based on the suffix of the token's type
            if (substr($token['type'], -5) === 'Start')
            {
                ++$depth;
            }
            elseif (substr($token['type'], -3) === 'End')
            {
                --$depth;
            }

            $pos = max($skipPos, $token['pos'] + $token['len']);
        }

        // Test for the presence of an unescaped dot
        if (preg_match('#(?<!\\\\)(?:\\\\\\\\)*\\.#', $literal))
        {
            if (strpos($def['modifiers'], 's') !== false
             || strpos($literal, "\n") !== false)
            {
                return '//';
            }

            $patterns[] = '.';

            // Remove unescaped dots
            $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\.#', '$1', $literal);
        }

        // Remove unescaped quantifiers *, + and ?
        $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[*+?]#', '$1', $literal);

        // Remove unescaped quantifiers {}
        $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\{[^}]+\\}#', '$1', $literal);

        // Remove backslash assertions \b, \B, \A, \Z, \z and \G, as well as back references
        $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\\\[bBAZzG1-9]#', '$1', $literal);

        // Remove unescaped ^, | and $
        $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[$^|]#', '$1', $literal);

        // Escape unescaped -, ^ and ] so they are safe to use in a character class
        $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)([-^\\]])#', '$1\\\\$2', $literal);

        // If the regexp doesn't use PCRE_DOLLAR_ENDONLY, it could end with a \n
        if (strpos($def['modifiers'], 'D') === false)
        {
            $literal .= "\n";
        }

        // Add the literal portion of the regexp to the patterns, as a character class
        if ($literal !== '')
        {
            $patterns[] = '[' . $literal . ']';
        }

        // Test whether this regexp actually matches anything
        if (empty($patterns))
        {
            return '/^$/D';
        }

        // Build the allowed characters regexp, reusing the delimiter of the source regexp
        $regexp = $def['delimiter'] . implode('|', $patterns) . $def['delimiter'];

        // Add the modifiers that affect how single characters are matched
        if (strpos($def['modifiers'], 'i') !== false)
        {
            $regexp .= 'i';
        }
        if (strpos($def['modifiers'], 'u') !== false)
        {
            $regexp .= 'u';
        }

        return $regexp;
    }

    /**
    * Return the name of each capture in given regexp
    *
    * Will return an empty string for unnamed captures. Index 0 of the returned array is always an
    * empty string, followed by one entry per capturing subpattern in the order they are opened.
    *
    * @param  string   $regexp
    * @return string[]
    *
    * @throws RuntimeException if the regexp cannot be parsed
    */
    public static function getCaptureNames($regexp)
    {
        $map        = [''];
        $regexpInfo = self::parse($regexp);
        foreach ($regexpInfo['tokens'] as $tok)
        {
            if ($tok['type'] === 'capturingSubpatternStart')
            {
                $map[] = $tok['name'] ?? '';
            }
        }

        return $map;
    }

    /**
    * Split a regexp into its delimiter, modifiers and body, and tokenize the body
    *
    * The returned array contains the following elements:
    *
    *  - "delimiter": the first character of the regexp
    *  - "modifiers": the letters that follow the closing delimiter
    *  - "regexp":    the content found between the delimiters
    *  - "tokens":    a list of tokens. Each token has a "pos" (offset in "regexp"), a "len" and a
    *                 "type". Character classes have a "content" and "quantifiers", options have
    *                 "options", subpattern start tokens receive an "endToken" (index of the
    *                 matching end token) and a "content" once closed, and subpattern end tokens
    *                 have "quantifiers".
    *
    * @param  string $regexp
    * @return array
    *
    * @throws RuntimeException if the delimiters cannot be identified, if a bracket or a parenthesis
    *                          is unbalanced, or if a subpattern type is not supported
    */
    public static function parse($regexp)
    {
        if (!preg_match('#^(.)(.*?)\\1([a-zA-Z]*)$#Ds', $regexp, $m))
        {
            throw new RuntimeException('Could not parse regexp delimiters');
        }

        $ret = [
            'delimiter' => $m[1],
            'modifiers' => $m[3],
            'regexp'    => $m[2],
            'tokens'    => []
        ];

        $regexp = $m[2];

        // Stack of indexes (in $ret['tokens']) of the subpatterns that have not been closed yet
        $openSubpatterns = [];

        $pos = 0;
        $regexpLen = strlen($regexp);

        while ($pos < $regexpLen)
        {
            switch ($regexp[$pos])
            {
                case '\\':
                    // skip next character
                    $pos += 2;
                    break;

                case '[':
                    if (!preg_match('#\\[(.*?(?<!\\\\)(?:\\\\\\\\)*+)\\]((?:[+*][+?]?|\\?)?)#A', $regexp, $m, 0, $pos))
                    {
                        throw new RuntimeException('Could not find matching bracket from pos ' . $pos);
                    }

                    $ret['tokens'][] = [
                        'pos'         => $pos,
                        'len'         => strlen($m[0]),
                        'type'        => 'characterClass',
                        'content'     => $m[1],
                        'quantifiers' => $m[2]
                    ];

                    $pos += strlen($m[0]);
                    break;

                case '(':
                    if (preg_match('#\\(\\?([a-z]*)\\)#iA', $regexp, $m, 0, $pos))
                    {
                        // This is an option (?i) so we skip past the right parenthesis
                        $ret['tokens'][] = [
                            'pos'     => $pos,
                            'len'     => strlen($m[0]),
                            'type'    => 'option',
                            'options' => $m[1]
                        ];

                        $pos += strlen($m[0]);
                        break;
                    }

                    // This should be a subpattern, we just have to sniff which kind
                    if (preg_match("#(?J)\\(\\?(?:P?<(?<name>[a-z_0-9]+)>|'(?<name>[a-z_0-9]+)')#A", $regexp, $m, \PREG_OFFSET_CAPTURE, $pos))
                    {
                        // This is a named capture
                        $tok = [
                            'pos'  => $pos,
                            'len'  => strlen($m[0][0]),
                            'type' => 'capturingSubpatternStart',
                            'name' => $m['name'][0]
                        ];

                        $pos += strlen($m[0][0]);
                    }
                    elseif (preg_match('#\\(\\?([a-z]*):#iA', $regexp, $m, 0, $pos))
                    {
                        // This is a non-capturing subpattern (?:xxx)
                        $tok = [
                            'pos'     => $pos,
                            'len'     => strlen($m[0]),
                            'type'    => 'nonCapturingSubpatternStart',
                            'options' => $m[1]
                        ];

                        $pos += strlen($m[0]);
                    }
                    elseif (preg_match('#\\(\\?>#iA', $regexp, $m, 0, $pos))
                    {
                        /* This is a non-capturing subpattern with atomic grouping "(?>x+)" */
                        $tok = [
                            'pos'     => $pos,
                            'len'     => strlen($m[0]),
                            'type'    => 'nonCapturingSubpatternStart',
                            'subtype' => 'atomic'
                        ];

                        $pos += strlen($m[0]);
                    }
                    elseif (preg_match('#\\(\\?(<?[!=])#A', $regexp, $m, 0, $pos))
                    {
                        // This is an assertion
                        $assertions = [
                            '='  => 'lookahead',
                            '<=' => 'lookbehind',
                            '!'  => 'negativeLookahead',
                            '<!' => 'negativeLookbehind'
                        ];

                        $tok = [
                            'pos'     => $pos,
                            'len'     => strlen($m[0]),
                            'type'    => $assertions[$m[1]] . 'AssertionStart'
                        ];

                        $pos += strlen($m[0]);
                    }
                    elseif (preg_match('#\\(\\?#A', $regexp, $m, 0, $pos))
                    {
                        throw new RuntimeException('Unsupported subpattern type at pos ' . $pos);
                    }
                    else
                    {
                        // This should be a normal capture
                        $tok = [
                            'pos'  => $pos,
                            'len'  => 1,
                            'type' => 'capturingSubpatternStart'
                        ];

                        ++$pos;
                    }

                    // Record the index this token is about to receive so that the matching right
                    // parenthesis can find it
                    $openSubpatterns[] = count($ret['tokens']);
                    $ret['tokens'][] = $tok;
                    break;

                case ')':
                    if (empty($openSubpatterns))
                    {
                        throw new RuntimeException('Could not find matching pattern start for right parenthesis at pos ' . $pos);
                    }

                    // Add the key to this token to its matching token and capture this subpattern's
                    // content
                    $k = array_pop($openSubpatterns);
                    $startToken =& $ret['tokens'][$k];
                    $startToken['endToken'] = count($ret['tokens']);
                    $startToken['content']  = substr(
                        $regexp,
                        $startToken['pos'] + $startToken['len'],
                        $pos - ($startToken['pos'] + $startToken['len'])
                    );

                    // Look for quantifiers after the subpattern, e.g. (?:ab)++
                    $spn = strspn($regexp, '+*?', 1 + $pos);
                    $quantifiers = substr($regexp, 1 + $pos, $spn);

                    // The end token's type mirrors the start token's, with "Start" replaced by "End"
                    $ret['tokens'][] = [
                        'pos'  => $pos,
                        'len'  => 1 + $spn,
                        'type' => substr($startToken['type'], 0, -5) . 'End',
                        'quantifiers' => $quantifiers
                    ];

                    // Break the reference so that later assignments do not alter the start token
                    unset($startToken);

                    $pos += 1 + $spn;
                    break;

                default:
                    ++$pos;
            }
        }

        if (!empty($openSubpatterns))
        {
            throw new RuntimeException('Could not find matching pattern end for left parenthesis at pos ' . $ret['tokens'][$openSubpatterns[0]]['pos']);
        }

        return $ret;
    }

    /**
    * Test whether given regexp body starts with the ^ anchor and ends with the $ anchor
    *
    * A trailing $ preceded by an odd number of backslashes is escaped, which makes it a literal
    * dollar sign rather than an anchor.
    *
    * @param  string $regexp Regexp body, without delimiters or modifiers
    * @return bool
    */
    private static function isAnchoredAtBothEnds($regexp)
    {
        if (substr($regexp, 0, 1) !== '^' || substr($regexp, -1) !== '$')
        {
            return false;
        }

        // Count the backslashes that immediately precede the final $
        $head        = substr($regexp, 0, -1);
        $backslashes = strlen($head) - strlen(rtrim($head, '\\'));

        return ($backslashes % 2 === 0);
    }
}