4 * Copyright 2008 Konrad Rudolph
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
27 * Helper functions for the Perl-compatible regular expressions.
28 * @package preg_helper
/**
 * Merges several regular expressions into one, using the indicated 'glue'.
 *
 * This function takes care of individual modifiers so it's safe to use
 * <i>different</i> modifiers on the individual expressions. The order of
 * sub-matches is preserved as well. Numbered back-references are adapted to
 * the new overall sub-match count. This means that it's safe to use numbered
 * back-references in the individual expressions!
 * If {@link $names} is given, the individual expressions are captured in
 * named sub-matches using the contents of that array as names. Those named
 * wrapper groups are capturing groups themselves and are taken into account
 * when back-references are renumbered.
 * Matching pair-delimiters (e.g. <var>"{…}"</var>) are currently
 * <b>not</b> supported.
 *
 * The function assumes that all regular expressions are well-formed.
 * Behaviour is undefined if they aren't.
 *
 * Known limitations:
 * - Only <var>\1</var>-style back-references are renumbered; <var>\g{1}</var>
 *   and <var>\k&lt;name&gt;</var> are left untouched, and octal escapes such
 *   as <var>\012</var> are indistinguishable from back-references.
 * - Parentheses inside character classes or <var>\Q…\E</var> spans, and
 *   groups inside a branch reset <var>(?|…)</var>, are counted as ordinary
 *   capturing groups.
 * - NOTE(review): modifiers are emitted as an inline option group
 *   <var>(?…)</var>. Modifiers PCRE presumably rejects there (e.g.
 *   <var>u</var>, <var>A</var>, <var>D</var>) would make the merged pattern
 *   invalid; confirm before relying on them.
 *
 * This function was created after a
 * {@link http://stackoverflow.com/questions/244959/ StackOverflow discussion}.
 * Much of it was written or thought of by “porneL” and “eyelidlessness”. Many
 * thanks to both of them.
 *
 * @param string $glue  A string to insert between the individual expressions.
 *      This should usually be either the empty string, indicating
 *      concatenation, or the pipe (<var>"|"</var>), indicating alternation.
 *      Notice that this string might have to be escaped since it is treated
 *      as a normal character in a regular expression (i.e. <var>"/"</var> will
 *      end the expression and result in an invalid output).
 * @param array $expressions    The expressions to merge. The expressions may
 *      have arbitrary different delimiters and modifiers.
 * @param array $names  Optional. This is either an empty array or an array of
 *      strings of the same length as {@link $expressions}. In that case,
 *      the strings of this array are used, in order, to create named
 *      sub-matches for the expressions. Spaces in names are replaced by
 *      underscores.
 * @return string|false A string representing a regular expression equivalent
 *      to the merged expressions. Returns <var>FALSE</var> if the glue is not
 *      a string, if the number of names does not match the number of
 *      expressions, or if one of the expressions cannot be processed.
 */
function preg_merge($glue, array $expressions, array $names = array()) {
    $use_names = count($names) !== 0;

    // Sanity check: names must either be absent or map 1:1 onto expressions.
    if (!is_string($glue) || ($use_names && count($names) !== count($expressions)))
        return false;

    // Names and expressions are paired by position, whatever keys were used.
    $names = array_values($names);

    // Matches the opening parenthesis of a real capturing group only.
    // Escaped parentheses, look-arounds, option groups, atomic groups,
    // comments and verbs must not shift the back-reference numbering.
    $capture_pattern = '/
        (?<!\\\\)(?:\\\\\\\\)*          # Backslashes before it pair up (so it is unescaped),
        \(                              # an opening parenthesis,
        (?! \* )                        # not introducing a (*VERB),
        (?! \? (?! P?<[^=!] | \' ) )    # and plain, or the start of a named group.
    /x';

    $backref_pattern = '/
        (?<!\\\\)        # Not preceded by a backslash,
        ((?:\\\\\\\\)*?) # zero or more escaped backslashes,
        \\\\ (\d+)       # followed by backslash plus digits.
    /x';

    $result = array();
    // Number of capturing groups emitted so far (wrappers included).
    $capture_count = 0;

    foreach (array_values($expressions) as $index => $expression) {
        // Get delimiters and modifiers:
        $stripped = preg_strip($expression);

        if ($stripped === false)
            return false;

        list($sub_expr, $modifiers) = $stripped;

        // We assume that the expression is correct and therefore don't check
        // for matching parentheses.
        $number_of_captures = preg_match_all($capture_pattern, $sub_expr, $unused);

        if ($number_of_captures === false)
            return false;

        // A named wrapper is a capturing group of its own and precedes the
        // expression's inner groups, so it shifts them by one more.
        $offset = $capture_count + ($use_names ? 1 : 0);

        // Re-adjust backreferences:
        // TODO What about \R backreferences (\0 isn't allowed, though)?
        if ($number_of_captures > 0) {
            $sub_expr = preg_replace_callback(
                $backref_pattern,
                function ($m) use ($offset) {
                    return $m[1] . '\\' . ((int) $m[2] + $offset);
                },
                $sub_expr
            );

            if ($sub_expr === null)
                return false;
        }

        $capture_count = $offset + $number_of_captures;

        // Last, construct the new sub-match:
        $modifiers = implode('', $modifiers);
        $sub_modifiers = $modifiers === '' ? '' : "(?$modifiers)";

        if ($use_names) {
            $name = str_replace(' ', '_', $names[$index]);
            $sub_name = "?<$name>";
        } else {
            $sub_name = '?:';
        }

        $result[] = "($sub_name$sub_modifiers$sub_expr)";
    }

    return '/' . implode($glue, $result) . '/';
}
/**
 * Strips a regular expression string of its delimiters and modifiers.
 * Additionally, normalizes the delimiters (i.e. reformats the pattern so that
 * it could have used <var>"/"</var> as delimiter).
 *
 * Only patterns whose opening and closing delimiter are the same character
 * are recognised. Alphanumeric characters, the backslash and whitespace are
 * not accepted as delimiters.
 *
 * @param string $expression    The regular expression string to strip.
 * @return array|false  An array whose first entry is the expression itself,
 *      the second an array of modifiers (one single-character string per
 *      modifier, empty if there are none). If the argument is not a valid
 *      regular expression, returns <var>FALSE</var>.
 */
function preg_strip($expression) {
    if (!is_string($expression))
        return false;

    // Group 1: delimiter, group 2: pattern body, group 3: trailing modifiers.
    if (preg_match('/^([^a-zA-Z0-9\\\\\s])(.*)\\1([imsxeADSUXJu]*)$/s', $expression, $matches) !== 1)
        return false;

    $delim = $matches[1];
    $sub_expr = $matches[2];

    if ($delim !== '/') {
        // Prefix matching a position preceded by an even number of
        // backslashes, i.e. where the next character is not escaped yet.
        $unescaped = '(?<!\\\\)((?:\\\\\\\\)*)';

        // Drop the backslash in front of the old delimiter, but only when the
        // delimiter has no special meaning in a pattern: for characters such
        // as "|" or "#" the backslash is what keeps them literal.
        if (strpos('^$.[]|()?*+{}-#,:=!<>', $delim) === false) {
            $sub_expr = preg_replace(
                '/' . $unescaped . '\\\\' . preg_quote($delim, '/') . '/',
                '${1}' . $delim,
                $sub_expr
            );
        }

        // Escape the new delimiter, leaving already-escaped slashes alone so
        // that "\/" does not turn into "\\/" (which would end the pattern).
        if ($sub_expr !== null)
            $sub_expr = preg_replace('/' . $unescaped . '\\//', '${1}\\\\/', $sub_expr);

        if ($sub_expr === null)
            return false;
    }

    $modifiers = $matches[3] === '' ? array() : str_split($matches[3]);

    return array($sub_expr, $modifiers);
}