7 * This source file is subject to the new BSD license that is bundled
8 * with this package in the file LICENSE.txt.
9 * It is also available through the world-wide-web at this URL:
10 * http://framework.zend.com/license/new-bsd
11 * If you did not receive a copy of the license and are unable to
12 * obtain it through the world-wide-web, please send an email
13 * to license@zend.com so we can send you a copy immediately.
16 * @package Zend_Search_Lucene
18 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
19 * @license http://framework.zend.com/license/new-bsd New BSD License
20 * @version $Id: QueryLexer.php 16971 2009-07-22 18:05:45Z mikaelkael $
23 /** Zend_Search_Lucene_FSM */
24 require_once 'Zend/Search/Lucene/FSM.php';
/** Zend_Search_Lucene_Search_QueryToken */
27 require_once 'Zend/Search/Lucene/Search/QueryToken.php';
31 * @package Zend_Search_Lucene
33 * @copyright Copyright (c) 2005-2009 Zend Technologies USA Inc. (http://www.zend.com)
34 * @license http://framework.zend.com/license/new-bsd New BSD License
36 class Zend_Search_Lucene_Search_QueryLexer extends Zend_Search_Lucene_FSM
/**
 * State Machine states
 *
 * NOTE(review): ST_LEXEME, ST_NUMBER and ST_ERROR are used by the rule tables
 * in the constructor but had no visible declaration; the values below simply
 * fill the gaps in the existing numbering - confirm against the upstream file.
 */
const ST_WHITE_SPACE     = 0;
const ST_SYNT_LEXEME     = 1;
const ST_LEXEME          = 2;
const ST_QUOTED_LEXEME   = 3;
const ST_ESCAPED_CHAR    = 4;
const ST_ESCAPED_QCHAR   = 5;
const ST_LEXEME_MODIFIER = 6;
const ST_NUMBER          = 7;
const ST_MANTISSA        = 8;
// Target state of every "not allowed" rule; those rules carry an action that throws.
const ST_ERROR           = 9;

/**
 * Input symbols
 *
 * NOTE(review): IN_QUOTE and IN_CHAR are returned by _translateInput() and used
 * by the rule tables but had no visible declaration; the values below fill the
 * gaps in the existing numbering - confirm against the upstream file.
 */
const IN_WHITE_SPACE     = 0;
const IN_SYNT_CHAR       = 1;
const IN_LEXEME_MODIFIER = 2;
const IN_ESCAPE_CHAR     = 3;
const IN_QUOTE           = 4;
const IN_DECIMAL_POINT   = 5;
const IN_ASCII_DIGIT     = 6;
const IN_CHAR            = 7;
const IN_MUTABLE_CHAR    = 8;

/** Character classes used by _translateInput() to classify a single character */
const QUERY_WHITE_SPACE_CHARS      = " \n\r\t";
const QUERY_SYNT_CHARS             = ':()[]{}!|&';
const QUERY_MUTABLE_CHARS          = '+-';
// Syntax characters that are only valid when doubled (see addQuerySyntaxLexeme()).
const QUERY_DOUBLECHARLEXEME_CHARS = '|&';
const QUERY_LEXEMEMODIFIER_CHARS   = '~^';
const QUERY_ASCIIDIGITS_CHARS      = '0123456789';

/**
 * List of recognized lexemes
 *
 * @var array
 */
private $_lexemes;

/**
 * Query string (array of single- or non single-byte characters)
 *
 * @var array
 */
private $_queryString;

/**
 * Current position within a query string
 * Used to create appropriate error messages
 *
 * @var integer
 */
private $_queryStringPosition;

/**
 * Recognized part of current lexeme
 *
 * @var string
 */
private $_currentLexeme;
/**
 * Build the lexer state machine.
 *
 * Registers the full set of states and input symbols with the parent FSM,
 * then installs one rule per (state, input) pair and the actions that
 * collect characters and emit tokens.
 */
public function __construct()
{
    // Every state and input symbol referenced by the rules below must be listed here.
    // ST_WHITE_SPACE is kept first (presumably the FSM's initial state - confirm in FSM.php).
    parent::__construct(
        array(
            self::ST_WHITE_SPACE,
            self::ST_SYNT_LEXEME,
            self::ST_LEXEME,
            self::ST_QUOTED_LEXEME,
            self::ST_ESCAPED_CHAR,
            self::ST_ESCAPED_QCHAR,
            self::ST_LEXEME_MODIFIER,
            self::ST_NUMBER,
            self::ST_MANTISSA,
            self::ST_ERROR,
        ),
        array(
            self::IN_WHITE_SPACE,
            self::IN_SYNT_CHAR,
            self::IN_MUTABLE_CHAR,
            self::IN_LEXEME_MODIFIER,
            self::IN_ESCAPE_CHAR,
            self::IN_QUOTE,
            self::IN_DECIMAL_POINT,
            self::IN_ASCII_DIGIT,
            self::IN_CHAR,
        )
    );

    // Actions attached to the "not allowed" rules; each one throws a parser exception.
    $lexemeModifierErrorAction    = new Zend_Search_Lucene_FSMAction($this, 'lexModifierErrException');
    $quoteWithinLexemeErrorAction = new Zend_Search_Lucene_FSMAction($this, 'quoteWithinLexemeErrException');
    $wrongNumberErrorAction       = new Zend_Search_Lucene_FSMAction($this, 'wrongNumberErrException');

    /** Rules: array(source state, input symbol, target state[, input action]) */

    // Between lexemes
    $this->addRules(array(
        array(self::ST_WHITE_SPACE, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_WHITE_SPACE, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_WHITE_SPACE, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
        array(self::ST_WHITE_SPACE, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
        array(self::ST_WHITE_SPACE, self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_CHAR),
        array(self::ST_WHITE_SPACE, self::IN_QUOTE,           self::ST_QUOTED_LEXEME),
        array(self::ST_WHITE_SPACE, self::IN_DECIMAL_POINT,   self::ST_LEXEME),
        array(self::ST_WHITE_SPACE, self::IN_ASCII_DIGIT,     self::ST_LEXEME),
        array(self::ST_WHITE_SPACE, self::IN_CHAR,            self::ST_LEXEME),
    ));

    // Right after a syntax element
    $this->addRules(array(
        array(self::ST_SYNT_LEXEME, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_SYNT_LEXEME, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_SYNT_LEXEME, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
        array(self::ST_SYNT_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
        array(self::ST_SYNT_LEXEME, self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_CHAR),
        array(self::ST_SYNT_LEXEME, self::IN_QUOTE,           self::ST_QUOTED_LEXEME),
        array(self::ST_SYNT_LEXEME, self::IN_DECIMAL_POINT,   self::ST_LEXEME),
        array(self::ST_SYNT_LEXEME, self::IN_ASCII_DIGIT,     self::ST_LEXEME),
        array(self::ST_SYNT_LEXEME, self::IN_CHAR,            self::ST_LEXEME),
    ));

    // Inside an unquoted lexeme ('+' and '-' are ordinary characters here)
    $this->addRules(array(
        array(self::ST_LEXEME, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_LEXEME, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_LEXEME, self::IN_MUTABLE_CHAR,    self::ST_LEXEME),
        array(self::ST_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
        array(self::ST_LEXEME, self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_CHAR),

        // IN_QUOTE not allowed
        array(self::ST_LEXEME, self::IN_QUOTE,           self::ST_ERROR, $quoteWithinLexemeErrorAction),

        array(self::ST_LEXEME, self::IN_DECIMAL_POINT,   self::ST_LEXEME),
        array(self::ST_LEXEME, self::IN_ASCII_DIGIT,     self::ST_LEXEME),
        array(self::ST_LEXEME, self::IN_CHAR,            self::ST_LEXEME),
    ));

    // Inside a quoted phrase: everything is literal except the escape char and the closing quote
    $this->addRules(array(
        array(self::ST_QUOTED_LEXEME, self::IN_WHITE_SPACE,     self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_SYNT_CHAR,       self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_MUTABLE_CHAR,    self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_ESCAPE_CHAR,     self::ST_ESCAPED_QCHAR),
        array(self::ST_QUOTED_LEXEME, self::IN_QUOTE,           self::ST_WHITE_SPACE),
        array(self::ST_QUOTED_LEXEME, self::IN_DECIMAL_POINT,   self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_ASCII_DIGIT,     self::ST_QUOTED_LEXEME),
        array(self::ST_QUOTED_LEXEME, self::IN_CHAR,            self::ST_QUOTED_LEXEME),
    ));

    // Character following an escape char outside quotes: always becomes part of a lexeme
    $this->addRules(array(
        array(self::ST_ESCAPED_CHAR, self::IN_WHITE_SPACE,     self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_SYNT_CHAR,       self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_MUTABLE_CHAR,    self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_LEXEME_MODIFIER, self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_ESCAPE_CHAR,     self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_QUOTE,           self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_DECIMAL_POINT,   self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_ASCII_DIGIT,     self::ST_LEXEME),
        array(self::ST_ESCAPED_CHAR, self::IN_CHAR,            self::ST_LEXEME),
    ));

    // Character following an escape char inside quotes: always becomes part of the phrase
    $this->addRules(array(
        array(self::ST_ESCAPED_QCHAR, self::IN_WHITE_SPACE,     self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_SYNT_CHAR,       self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_MUTABLE_CHAR,    self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_ESCAPE_CHAR,     self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_QUOTE,           self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_DECIMAL_POINT,   self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_ASCII_DIGIT,     self::ST_QUOTED_LEXEME),
        array(self::ST_ESCAPED_QCHAR, self::IN_CHAR,            self::ST_QUOTED_LEXEME),
    ));

    // Right after a lexeme modifier ('~' or '^'): only a number, white space or syntax may follow
    $this->addRules(array(
        array(self::ST_LEXEME_MODIFIER, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_LEXEME_MODIFIER, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_LEXEME_MODIFIER, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
        array(self::ST_LEXEME_MODIFIER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),

        // IN_ESCAPE_CHAR not allowed
        array(self::ST_LEXEME_MODIFIER, self::IN_ESCAPE_CHAR,     self::ST_ERROR, $lexemeModifierErrorAction),

        // IN_QUOTE not allowed
        array(self::ST_LEXEME_MODIFIER, self::IN_QUOTE,           self::ST_ERROR, $lexemeModifierErrorAction),

        array(self::ST_LEXEME_MODIFIER, self::IN_DECIMAL_POINT,   self::ST_MANTISSA),
        array(self::ST_LEXEME_MODIFIER, self::IN_ASCII_DIGIT,     self::ST_NUMBER),

        // IN_CHAR not allowed
        array(self::ST_LEXEME_MODIFIER, self::IN_CHAR,            self::ST_ERROR, $lexemeModifierErrorAction),
    ));

    // Integer part of a modifier number
    $this->addRules(array(
        array(self::ST_NUMBER, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_NUMBER, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_NUMBER, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
        array(self::ST_NUMBER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),

        // IN_ESCAPE_CHAR not allowed
        array(self::ST_NUMBER, self::IN_ESCAPE_CHAR,     self::ST_ERROR, $wrongNumberErrorAction),

        // IN_QUOTE not allowed
        array(self::ST_NUMBER, self::IN_QUOTE,           self::ST_ERROR, $wrongNumberErrorAction),

        array(self::ST_NUMBER, self::IN_DECIMAL_POINT,   self::ST_MANTISSA),
        array(self::ST_NUMBER, self::IN_ASCII_DIGIT,     self::ST_NUMBER),

        // IN_CHAR not allowed
        array(self::ST_NUMBER, self::IN_CHAR,            self::ST_ERROR, $wrongNumberErrorAction),
    ));

    // Fractional part of a modifier number (a second decimal point is an error)
    $this->addRules(array(
        array(self::ST_MANTISSA, self::IN_WHITE_SPACE,     self::ST_WHITE_SPACE),
        array(self::ST_MANTISSA, self::IN_SYNT_CHAR,       self::ST_SYNT_LEXEME),
        array(self::ST_MANTISSA, self::IN_MUTABLE_CHAR,    self::ST_SYNT_LEXEME),
        array(self::ST_MANTISSA, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),

        // IN_ESCAPE_CHAR not allowed
        array(self::ST_MANTISSA, self::IN_ESCAPE_CHAR,     self::ST_ERROR, $wrongNumberErrorAction),

        // IN_QUOTE not allowed
        array(self::ST_MANTISSA, self::IN_QUOTE,           self::ST_ERROR, $wrongNumberErrorAction),

        // IN_DECIMAL_POINT not allowed
        array(self::ST_MANTISSA, self::IN_DECIMAL_POINT,   self::ST_ERROR, $wrongNumberErrorAction),

        array(self::ST_MANTISSA, self::IN_ASCII_DIGIT,     self::ST_MANTISSA),

        // IN_CHAR not allowed
        array(self::ST_MANTISSA, self::IN_CHAR,            self::ST_ERROR, $wrongNumberErrorAction),
    ));

    /** Actions */
    $syntaxLexemeAction    = new Zend_Search_Lucene_FSMAction($this, 'addQuerySyntaxLexeme');
    $lexemeModifierAction  = new Zend_Search_Lucene_FSMAction($this, 'addLexemeModifier');
    $addLexemeAction       = new Zend_Search_Lucene_FSMAction($this, 'addLexeme');
    $addQuotedLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addQuotedLexeme');
    $addNumberLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addNumberLexeme');
    $addLexemeCharAction   = new Zend_Search_Lucene_FSMAction($this, 'addLexemeChar');

    /** Syntax lexeme */
    $this->addEntryAction(self::ST_SYNT_LEXEME, $syntaxLexemeAction);
    // Two lexemes in succession
    $this->addTransitionAction(self::ST_SYNT_LEXEME, self::ST_SYNT_LEXEME, $syntaxLexemeAction);

    /** Lexeme */
    $this->addEntryAction(self::ST_LEXEME, $addLexemeCharAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME, $addLexemeCharAction);
    // ST_ESCAPED_CHAR => ST_LEXEME transition is covered by ST_LEXEME entry action

    // Leaving ST_LEXEME emits the collected word.
    $this->addTransitionAction(self::ST_LEXEME, self::ST_WHITE_SPACE,     $addLexemeAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_SYNT_LEXEME,     $addLexemeAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_QUOTED_LEXEME,   $addLexemeAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME_MODIFIER, $addLexemeAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_NUMBER,          $addLexemeAction);
    $this->addTransitionAction(self::ST_LEXEME, self::ST_MANTISSA,        $addLexemeAction);

    /** Quoted lexeme */
    // No entry action is registered, so the opening quote itself is skipped
    $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
    $this->addTransitionAction(self::ST_ESCAPED_QCHAR, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
    // Closing quote changes state to the ST_WHITE_SPACE other states are not used
    $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_WHITE_SPACE,   $addQuotedLexemeAction);

    /** Lexeme modifier */
    $this->addEntryAction(self::ST_LEXEME_MODIFIER, $lexemeModifierAction);

    /** Number */
    $this->addEntryAction(self::ST_NUMBER,   $addLexemeCharAction);
    $this->addEntryAction(self::ST_MANTISSA, $addLexemeCharAction);
    $this->addTransitionAction(self::ST_NUMBER,   self::ST_NUMBER,   $addLexemeCharAction);
    // ST_NUMBER => ST_MANTISSA transition is covered by ST_MANTISSA entry action
    $this->addTransitionAction(self::ST_MANTISSA, self::ST_MANTISSA, $addLexemeCharAction);

    // Leaving either number state emits the collected number.
    $this->addTransitionAction(self::ST_NUMBER,   self::ST_WHITE_SPACE,     $addNumberLexemeAction);
    $this->addTransitionAction(self::ST_NUMBER,   self::ST_SYNT_LEXEME,     $addNumberLexemeAction);
    $this->addTransitionAction(self::ST_NUMBER,   self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
    $this->addTransitionAction(self::ST_MANTISSA, self::ST_WHITE_SPACE,     $addNumberLexemeAction);
    $this->addTransitionAction(self::ST_MANTISSA, self::ST_SYNT_LEXEME,     $addNumberLexemeAction);
    $this->addTransitionAction(self::ST_MANTISSA, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
}
/**
 * Translate input char to an input symbol of state machine
 *
 * @param string $char
 * @return integer one of the IN_* constants
 */
private function _translateInput($char)
{
    // Classes recognized by membership in a fixed character set, tried in this order.
    $characterSets = array(
        self::IN_WHITE_SPACE     => self::QUERY_WHITE_SPACE_CHARS,
        self::IN_SYNT_CHAR       => self::QUERY_SYNT_CHARS,
        self::IN_MUTABLE_CHAR    => self::QUERY_MUTABLE_CHARS,
        self::IN_LEXEME_MODIFIER => self::QUERY_LEXEMEMODIFIER_CHARS,
        self::IN_ASCII_DIGIT     => self::QUERY_ASCIIDIGITS_CHARS,
    );

    foreach ($characterSets as $inputSymbol => $members) {
        if (strpos($members, $char) !== false) {
            return $inputSymbol;
        }
    }

    // Classes made of exactly one character.
    if ($char === '"') {
        return self::IN_QUOTE;
    }
    if ($char === '.') {
        return self::IN_DECIMAL_POINT;
    }
    if ($char === '\\') {
        return self::IN_ESCAPE_CHAR;
    }

    // Anything else is an ordinary lexeme character.
    return self::IN_CHAR;
}
/**
 * This method is used to tokenize query string into lexemes
 *
 * The lexer instance may be reused: each call starts from a clean state,
 * even if a previous call ended with an exception.
 *
 * @param string $inputString
 * @param string $encoding
 * @return array list of Zend_Search_Lucene_Search_QueryToken objects
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public function tokenize($inputString, $encoding)
{
    // A previous run that threw leaves the machine in a non-initial state and
    // may leave a partially collected lexeme behind; clear both before starting.
    $this->reset();
    $this->_currentLexeme = '';

    $this->_lexemes     = array();
    $this->_queryString = array();

    if (PHP_OS == 'AIX' && $encoding == '') {
        $encoding = 'ISO8859-1';
    }
    $strLength = iconv_strlen($inputString, $encoding);

    // NOTE(review): the source carried a "Workaround for iconv_substr bug" remark
    // at this point with no accompanying statement; confirm whether a workaround
    // is still required for the supported PHP versions.

    // Split the input into an array of characters in the given encoding.
    for ($count = 0; $count < $strLength; $count++) {
        $this->_queryString[$count] = iconv_substr($inputString, $count, 1, $encoding);
    }

    // The character array is not modified while processing, so its size is fixed.
    // The position itself may be advanced by addQuerySyntaxLexeme() for two-char lexemes.
    $queryLength = count($this->_queryString);
    for ($this->_queryStringPosition = 0;
         $this->_queryStringPosition < $queryLength;
         $this->_queryStringPosition++) {
        $this->process($this->_translateInput($this->_queryString[$this->_queryStringPosition]));
    }

    // Feed a terminating white space so that a lexeme in progress is emitted.
    $this->process(self::IN_WHITE_SPACE);

    // Still not between lexemes (e.g. an unterminated quoted phrase or a trailing escape char).
    if ($this->getState() != self::ST_WHITE_SPACE) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Unexpected end of query');
    }

    $this->_queryString = null;

    return $this->_lexemes;
}
/*********************************************************************
 * Actions implementation
 *
 * Actions modify the list of recognized lexemes
 *********************************************************************/
/**
 * Add query syntax lexeme
 *
 * Handles the two-character lexemes ('||', '&&') by consuming the second
 * character, and turns a preceding word into a field name when the lexeme
 * is the field indicator ':'.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public function addQuerySyntaxLexeme()
{
    $lexeme = $this->_queryString[$this->_queryStringPosition];

    // Process two char lexemes
    if (strpos(self::QUERY_DOUBLECHARLEXEME_CHARS, $lexeme) !== false) {
        // increase current position in a query string
        $this->_queryStringPosition++;

        // the same character must follow; end of input or anything else is an error
        if ($this->_queryStringPosition == count($this->_queryString)  ||
            $this->_queryString[$this->_queryStringPosition] != $lexeme) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Two chars lexeme expected. ' . $this->_positionMsg());
        }

        // duplicate character
        $lexeme .= $lexeme;
    }

    // Token text is passed as the second argument, as in the other add* actions.
    $token = new Zend_Search_Lucene_Search_QueryToken(
                                Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT,
                                $lexeme,
                                $this->_queryStringPosition);

    // Skip this lexeme if it's a field indicator ':' and treat previous as 'field' instead of 'word'
    if ($token->type == Zend_Search_Lucene_Search_QueryToken::TT_FIELD_INDICATOR) {
        $token = array_pop($this->_lexemes);
        if ($token === null  ||  $token->type != Zend_Search_Lucene_Search_QueryToken::TT_WORD) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Field mark \':\' must follow field name. ' . $this->_positionMsg());
        }

        $token->type = Zend_Search_Lucene_Search_QueryToken::TT_FIELD;
    }

    $this->_lexemes[] = $token;
}
/**
 * Add lexeme modifier
 *
 * Appends the character at the current position to the lexeme list
 * as a syntax element token.
 */
public function addLexemeModifier()
{
    $position = $this->_queryStringPosition;
    $modifier = $this->_queryString[$position];

    $token = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT,
        $modifier,
        $position
    );

    $this->_lexemes[] = $token;
}
/**
 * Add lexeme
 *
 * Emits the collected characters as a word token and starts a new lexeme.
 */
public function addLexeme()
{
    // The action runs on the character after the word, hence position - 1.
    $lastCharPosition = $this->_queryStringPosition - 1;

    $token = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_WORD,
        $this->_currentLexeme,
        $lastCharPosition
    );
    $this->_lexemes[] = $token;

    $this->_currentLexeme = '';
}
/**
 * Add quoted lexeme
 *
 * Emits the collected characters as a phrase token and starts a new lexeme.
 */
public function addQuotedLexeme()
{
    $token = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_PHRASE,
        $this->_currentLexeme,
        $this->_queryStringPosition
    );
    $this->_lexemes[] = $token;

    $this->_currentLexeme = '';
}
/**
 * Add number lexeme
 *
 * Emits the collected characters as a number token and starts a new lexeme.
 */
public function addNumberLexeme()
{
    // The action runs on the character after the number, hence position - 1.
    $lastCharPosition = $this->_queryStringPosition - 1;

    $token = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_NUMBER,
        $this->_currentLexeme,
        $lastCharPosition
    );
    $this->_lexemes[] = $token;

    $this->_currentLexeme = '';
}
/**
 * Extend lexeme by one char
 *
 * Appends the character at the current query position to the lexeme being collected.
 */
public function addLexemeChar()
{
    $char = $this->_queryString[$this->_queryStringPosition];

    $this->_currentLexeme = $this->_currentLexeme . $char;
}
/**
 * Position message
 *
 * Builds the position suffix appended to the lexer's error messages.
 *
 * @return string
 */
private function _positionMsg()
{
    $position = $this->_queryStringPosition;

    return 'Position is ' . $position . '.';
}
/*********************************************************************
 * Syntax error actions
 *********************************************************************/
/**
 * Error action: a lexeme modifier is followed by a character that is not allowed there.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException always
 */
public function lexModifierErrException()
{
    require_once 'Zend/Search/Lucene/Search/QueryParserException.php';

    $message = 'Lexeme modifier character can be followed only by number, white space or query syntax element. '
             . $this->_positionMsg();

    throw new Zend_Search_Lucene_Search_QueryParserException($message);
}
/**
 * Error action: an unescaped quote was found inside an unquoted lexeme.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException always
 */
public function quoteWithinLexemeErrException()
{
    require_once 'Zend/Search/Lucene/Search/QueryParserException.php';

    $message = 'Quote within lexeme must be escaped by \'\\\' char. '
             . $this->_positionMsg();

    throw new Zend_Search_Lucene_Search_QueryParserException($message);
}
/**
 * Error action: a character that cannot be part of a number was found while reading one.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException always
 */
public function wrongNumberErrException()
{
    require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
    // Trailing space keeps the sentence separated from the position suffix,
    // matching the other error actions.
    throw new Zend_Search_Lucene_Search_QueryParserException('Wrong number syntax. ' . $this->_positionMsg());
}