Verzeichnisstruktur phpBB-3.3.15
- Veröffentlicht
- 28.08.2024
So funktioniert es
|
Auf das letzte Element klicken. Dies geht jeweils einen Schritt zurück. |
Auf das Icon klicken, dies öffnet das Verzeichnis. Nochmal klicken schließt das Verzeichnis. |
|
(Beispiel Datei-Icons)
|
Auf das Icon klicken, um den Quellcode anzuzeigen |
RegexpParser.php
001 <?php
002
003 /**
004 * @package s9e\TextFormatter
005 * @copyright Copyright (c) 2010-2022 The s9e authors
006 * @license http://www.opensource.org/licenses/mit-license.php The MIT License
007 */
008 namespace s9e\TextFormatter\Configurator\Helpers;
009
010 use RuntimeException;
011
abstract class RegexpParser
{
    /**
     * Generate a regexp that matches any single character allowed in a regexp
     *
     * This method will generate a regexp that can be used to determine whether a given character
     * could in theory be allowed in a string that matches the source regexp. The literal portion
     * of the source regexp is folded into a single character class and every character class of
     * the source regexp is kept as its own alternative. For example:
     *
     *  - /^a+$/D      generates /[a]/
     *  - /^foo\d+$/D  generates /[foo\d]/
     *  - /foo/        generates // because it's not anchored, so any characters could be found
     *                 before or after the literal "foo"
     *
     * The catch-all regexp // is also returned if the source regexp uses the "m" modifier, if it
     * has a top-level alternation whose branches are not all anchored, or if it contains a dot
     * that can match a newline. The regexp /^$/D is returned if nothing at all can be matched.
     *
     * Only the "i" and "u" modifiers of the source regexp are carried over to the result, which
     * reuses the delimiter of the source regexp.
     *
     * @param  string $regexp Source regexp
     * @return string         Regexp that matches any single character allowed in the source regexp
     *
     * @throws RuntimeException if the source regexp cannot be parsed by parse()
     */
    public static function getAllowedCharacterRegexp($regexp)
    {
        $def = self::parse($regexp);

        // If the regexp uses the multiline modifier, this regexp can't match the whole string if
        // it contains newlines, so in effect it could allow any content
        if (strpos($def['modifiers'], 'm') !== false)
        {
            return '//';
        }

        // The pattern must start with ^ and end with $ otherwise anything could surround the match
        if (substr($def['regexp'], 0, 1) !== '^'
         || substr($def['regexp'], -1) !== '$')
        {
            return '//';
        }

        // Append a token to mark the end of the regexp. This zero-length sentinel ensures that
        // the text located after the last real token is collected by the loop below
        $def['tokens'][] = [
            'pos'  => strlen($def['regexp']),
            'len'  => 0,
            'type' => 'end'
        ];

        $patterns = [];

        // Collect the literal portions of the source regexp while testing for alternations
        $literal = '';
        $pos     = 0;
        $skipPos = 0;
        $depth   = 0;
        foreach ($def['tokens'] as $token)
        {
            // Skip options
            if ($token['type'] === 'option')
            {
                $skipPos = max($skipPos, $token['pos'] + $token['len']);
            }

            // Skip assertions: everything up to and including the matching end token is ignored
            // because lookarounds do not consume any character
            if (strpos($token['type'], 'AssertionStart') !== false)
            {
                $endToken = $def['tokens'][$token['endToken']];
                $skipPos  = max($skipPos, $endToken['pos'] + $endToken['len']);
            }

            if ($token['pos'] >= $skipPos)
            {
                if ($token['type'] === 'characterClass')
                {
                    $patterns[] = '[' . $token['content'] . ']';
                }

                if ($token['pos'] > $pos)
                {
                    // Capture the content between last position and current position
                    $tmp = substr($def['regexp'], $pos, $token['pos'] - $pos);

                    // Append the content to the literal portion
                    $literal .= $tmp;

                    // Test for alternations if it's the root of the regexp. Every branch of a
                    // top-level alternation must be anchored on both sides, e.g. ^a$|^b$
                    if (!$depth)
                    {
                        // Remove literal backslashes for convenience
                        $tmp = str_replace('\\\\', '', $tmp);

                        // Look for an unescaped | that is not followed by ^
                        if (preg_match('/(?<!\\\\)\\|(?!\\^)/', $tmp))
                        {
                            return '//';
                        }

                        // Look for an unescaped | that is not preceded by $
                        if (preg_match('/(?<![$\\\\])\\|/', $tmp))
                        {
                            return '//';
                        }
                    }
                }
            }

            // Keep track of the subpattern nesting level. The sentinel's type is the lowercase
            // "end" which matches neither suffix
            if (substr($token['type'], -5) === 'Start')
            {
                ++$depth;
            }
            elseif (substr($token['type'], -3) === 'End')
            {
                --$depth;
            }

            $pos = max($skipPos, $token['pos'] + $token['len']);
        }

        // Test for the presence of an unescaped dot
        if (preg_match('#(?<!\\\\)(?:\\\\\\\\)*\\.#', $literal))
        {
            // With the "s" modifier the dot matches everything. A literal newline in the pattern
            // combined with a dot also ends up covering every character
            if (strpos($def['modifiers'], 's') !== false
             || strpos($literal, "\n") !== false)
            {
                return '//';
            }

            $patterns[] = '.';

            // Remove unescaped dots
            $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\.#', '$1', $literal);
        }

        // Remove unescaped quantifiers *, + and ?
        $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[*+?]#', '$1', $literal);

        // Remove unescaped quantifiers {}
        $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\{[^}]+\\}#', '$1', $literal);

        // Remove backslash assertions \b, \B, \A, \Z, \z and \G, as well as back references
        $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\\\[bBAZzG1-9]#', '$1', $literal);

        // Remove unescaped ^, | and $
        $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[$^|]#', '$1', $literal);

        // Escape unescaped - and ] so they are safe to use in a character class
        $literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)([-^\\]])#', '$1\\\\$2', $literal);

        // If the regexp doesn't use PCRE_DOLLAR_ENDONLY, it could end with a \n
        if (strpos($def['modifiers'], 'D') === false)
        {
            $literal .= "\n";
        }

        // Add the literal portion of the regexp to the patterns, as a character class
        if ($literal !== '')
        {
            $patterns[] = '[' . $literal . ']';
        }

        // Test whether this regexp actually matches anything
        if (empty($patterns))
        {
            return '/^$/D';
        }

        // Build the allowed characters regexp
        $regexp = $def['delimiter'] . implode('|', $patterns) . $def['delimiter'];

        // Add the modifiers
        if (strpos($def['modifiers'], 'i') !== false)
        {
            $regexp .= 'i';
        }
        if (strpos($def['modifiers'], 'u') !== false)
        {
            $regexp .= 'u';
        }

        return $regexp;
    }

    /**
     * Return the name of each capture in given regexp
     *
     * The returned list is indexed by capture number. Index 0 stands for the whole match and is
     * always an empty string. Unnamed captures are represented by an empty string as well.
     *
     * @param  string   $regexp
     * @return string[]
     *
     * @throws RuntimeException if the regexp cannot be parsed by parse()
     */
    public static function getCaptureNames($regexp)
    {
        $map        = [''];
        $regexpInfo = self::parse($regexp);
        foreach ($regexpInfo['tokens'] as $tok)
        {
            if ($tok['type'] === 'capturingSubpatternStart')
            {
                $map[] = $tok['name'] ?? '';
            }
        }

        return $map;
    }

    /**
     * Split a regexp into its delimiter, modifiers, pattern and a list of structural tokens
     *
     * The returned array contains the following elements:
     *
     *  - "delimiter": the first character of the regexp
     *  - "modifiers": the letters that follow the closing delimiter
     *  - "regexp":    the pattern located between the delimiters
     *  - "tokens":    a list of tokens in order of appearance. Each token has a "pos" (offset in
     *                 the pattern), a "len" and a "type". Depending on its type, a token may also
     *                 have other elements:
     *      - characterClass: "content" (text between brackets) and "quantifiers"
     *      - option:         "options" (the letters in an inline option such as (?i))
     *      - *Start tokens:  "endToken" (key of the matching end token), "content" (text between
     *                        the start and end tokens) and optionally "name" (named captures),
     *                        "options" (non-capturing subpatterns) or "subtype" (atomic groups)
     *      - *End tokens:    "quantifiers" (any sequence of +, * and ? that follows the right
     *                        parenthesis)
     *
     * Anything that is not a character class, an inline option or a parenthesis is not tokenized.
     *
     * @param  string $regexp
     * @return array
     *
     * @throws RuntimeException if the delimiters cannot be identified, if a bracket or a
     *                          parenthesis is unbalanced or if a subpattern type is unsupported
     */
    public static function parse($regexp)
    {
        if (!preg_match('#^(.)(.*?)\\1([a-zA-Z]*)$#Ds', $regexp, $m))
        {
            throw new RuntimeException('Could not parse regexp delimiters');
        }

        $ret = [
            'delimiter' => $m[1],
            'modifiers' => $m[3],
            'regexp'    => $m[2],
            'tokens'    => []
        ];

        $regexp = $m[2];

        // Keys of the start tokens whose right parenthesis has not been found yet
        $openSubpatterns = [];

        $pos = 0;
        $regexpLen = strlen($regexp);

        while ($pos < $regexpLen)
        {
            switch ($regexp[$pos])
            {
                case '\\':
                    // skip next character
                    $pos += 2;
                    break;

                case '[':
                    // Match up to the first right bracket that is not escaped, plus an optional
                    // quantifier made of +, * and ?
                    if (!preg_match('#\\[(.*?(?<!\\\\)(?:\\\\\\\\)*+)\\]((?:[+*][+?]?|\\?)?)#A', $regexp, $m, 0, $pos))
                    {
                        throw new RuntimeException('Could not find matching bracket from pos ' . $pos);
                    }

                    $ret['tokens'][] = [
                        'pos'         => $pos,
                        'len'         => strlen($m[0]),
                        'type'        => 'characterClass',
                        'content'     => $m[1],
                        'quantifiers' => $m[2]
                    ];

                    $pos += strlen($m[0]);
                    break;

                case '(':
                    if (preg_match('#\\(\\?([a-z]*)\\)#iA', $regexp, $m, 0, $pos))
                    {
                        // This is an option (?i) so we skip past the right parenthesis
                        $ret['tokens'][] = [
                            'pos'     => $pos,
                            'len'     => strlen($m[0]),
                            'type'    => 'option',
                            'options' => $m[1]
                        ];

                        $pos += strlen($m[0]);
                        break;
                    }

                    // This should be a subpattern, we just have to sniff which kind. Names may
                    // contain uppercase letters as well as lowercase letters, digits and
                    // underscores
                    if (preg_match("#(?J)\\(\\?(?:P?<(?<name>[A-Za-z_0-9]+)>|'(?<name>[A-Za-z_0-9]+)')#A", $regexp, $m, \PREG_OFFSET_CAPTURE, $pos))
                    {
                        // This is a named capture
                        $tok = [
                            'pos'  => $pos,
                            'len'  => strlen($m[0][0]),
                            'type' => 'capturingSubpatternStart',
                            'name' => $m['name'][0]
                        ];

                        $pos += strlen($m[0][0]);
                    }
                    elseif (preg_match('#\\(\\?([a-z]*):#iA', $regexp, $m, 0, $pos))
                    {
                        // This is a non-capturing subpattern (?:xxx)
                        $tok = [
                            'pos'     => $pos,
                            'len'     => strlen($m[0]),
                            'type'    => 'nonCapturingSubpatternStart',
                            'options' => $m[1]
                        ];

                        $pos += strlen($m[0]);
                    }
                    elseif (preg_match('#\\(\\?>#iA', $regexp, $m, 0, $pos))
                    {
                        /* This is a non-capturing subpattern with atomic grouping "(?>x+)" */
                        $tok = [
                            'pos'     => $pos,
                            'len'     => strlen($m[0]),
                            'type'    => 'nonCapturingSubpatternStart',
                            'subtype' => 'atomic'
                        ];

                        $pos += strlen($m[0]);
                    }
                    elseif (preg_match('#\\(\\?(<?[!=])#A', $regexp, $m, 0, $pos))
                    {
                        // This is an assertion
                        $assertions = [
                            '='  => 'lookahead',
                            '<=' => 'lookbehind',
                            '!'  => 'negativeLookahead',
                            '<!' => 'negativeLookbehind'
                        ];

                        $tok = [
                            'pos'  => $pos,
                            'len'  => strlen($m[0]),
                            'type' => $assertions[$m[1]] . 'AssertionStart'
                        ];

                        $pos += strlen($m[0]);
                    }
                    elseif (preg_match('#\\(\\?#A', $regexp, $m, 0, $pos))
                    {
                        throw new RuntimeException('Unsupported subpattern type at pos ' . $pos);
                    }
                    else
                    {
                        // This should be a normal capture
                        $tok = [
                            'pos'  => $pos,
                            'len'  => 1,
                            'type' => 'capturingSubpatternStart'
                        ];

                        ++$pos;
                    }

                    // Record the key of the start token so that it can be paired with its end
                    $openSubpatterns[] = count($ret['tokens']);
                    $ret['tokens'][]   = $tok;
                    break;

                case ')':
                    if (empty($openSubpatterns))
                    {
                        throw new RuntimeException('Could not find matching pattern start for right parenthesis at pos ' . $pos);
                    }

                    // Add the key to this token to its matching token and capture this subpattern's
                    // content
                    $k          = array_pop($openSubpatterns);
                    $startToken =& $ret['tokens'][$k];
                    $startToken['endToken'] = count($ret['tokens']);
                    $startToken['content']  = substr(
                        $regexp,
                        $startToken['pos'] + $startToken['len'],
                        $pos - ($startToken['pos'] + $startToken['len'])
                    );

                    // Look for quantifiers after the subpattern, e.g. (?:ab)++
                    $spn         = strspn($regexp, '+*?', 1 + $pos);
                    $quantifiers = substr($regexp, 1 + $pos, $spn);

                    // The type of the end token mirrors the type of the start token, with its
                    // "Start" suffix replaced with "End"
                    $ret['tokens'][] = [
                        'pos'         => $pos,
                        'len'         => 1 + $spn,
                        'type'        => substr($startToken['type'], 0, -5) . 'End',
                        'quantifiers' => $quantifiers
                    ];

                    // Break the reference so that later assignments don't modify the token
                    unset($startToken);

                    $pos += 1 + $spn;
                    break;

                default:
                    ++$pos;
            }
        }

        if (!empty($openSubpatterns))
        {
            throw new RuntimeException('Could not find matching pattern end for left parenthesis at pos ' . $ret['tokens'][$openSubpatterns[0]]['pos']);
        }

        return $ret;
    }
}