# query_lexer.py (3.8 KB)
from __future__ import unicode_literals

from lunr.tokenizer import default_separator
  3. class QueryLexer:
  4. # TODO: use iteration protocol?
  5. EOS = "EOS"
  6. FIELD = "FIELD"
  7. TERM = "TERM"
  8. EDIT_DISTANCE = "EDIT_DISTANCE"
  9. BOOST = "BOOST"
  10. PRESENCE = "PRESENCE"
  11. def __init__(self, string):
  12. self.lexemes = []
  13. self.string = string
  14. self.length = len(string)
  15. self.pos = 0
  16. self.start = 0
  17. self.escape_char_positions = []
  18. @property
  19. def width(self):
  20. return self.pos - self.start
  21. def ignore(self):
  22. if self.start == self.pos:
  23. self.pos += 1
  24. self.start = self.pos
  25. def backup(self):
  26. self.pos -= 1
  27. def accept_digit_run(self):
  28. char = self.next()
  29. while char != self.EOS and (47 < ord(char) < 58):
  30. char = self.next()
  31. if char != self.EOS:
  32. self.backup()
  33. def run(self):
  34. state = self.lex_text()
  35. while state:
  36. state = state()
  37. def slice_string(self):
  38. subslices = []
  39. slice_start = self.start
  40. for escape_char_position in self.escape_char_positions:
  41. subslices.append(self.string[slice_start:escape_char_position])
  42. slice_start = escape_char_position + 1
  43. subslices.append(self.string[slice_start : self.pos])
  44. self.escape_char_positions = []
  45. return "".join(subslices)
  46. def next(self):
  47. if self.pos >= self.length:
  48. return self.EOS
  49. char = self.string[self.pos]
  50. self.pos += 1
  51. return char
  52. def emit(self, type_):
  53. self.lexemes.append(
  54. {
  55. "type": type_,
  56. "string": self.slice_string(),
  57. "start": self.start,
  58. "end": self.pos,
  59. }
  60. )
  61. self.start = self.pos
  62. def escape_character(self):
  63. self.escape_char_positions.append(self.pos - 1)
  64. self.pos += 1
  65. def lex_field(self):
  66. self.backup()
  67. self.emit(self.FIELD)
  68. self.ignore()
  69. return self.lex_text
  70. def lex_term(self):
  71. if self.width > 1:
  72. self.backup()
  73. self.emit(self.TERM)
  74. self.ignore()
  75. return self.lex_text
  76. def lex_edit_distance(self):
  77. self.ignore()
  78. self.accept_digit_run()
  79. self.emit(self.EDIT_DISTANCE)
  80. return self.lex_text
  81. def lex_boost(self):
  82. self.ignore()
  83. self.accept_digit_run()
  84. self.emit(self.BOOST)
  85. return self.lex_text
  86. def lex_EOS(self):
  87. if self.width > 0:
  88. self.emit(self.TERM)
  89. def lex_text(self):
  90. while True:
  91. char = self.next()
  92. if char == self.EOS:
  93. return self.lex_EOS
  94. if ord(char) == 92: # Escape character is '\'
  95. self.escape_character()
  96. continue
  97. if char == ":":
  98. return self.lex_field
  99. if char == "~":
  100. self.backup()
  101. if self.width > 0:
  102. self.emit(self.TERM)
  103. return self.lex_edit_distance
  104. if char == "^":
  105. self.backup()
  106. if self.width > 0:
  107. self.emit(self.TERM)
  108. return self.lex_boost
  109. # '+' indicates term presence is required, check for length to
  110. # ensure only a leading '+' is considered
  111. if char == "+" and self.width == 1:
  112. self.emit(self.PRESENCE)
  113. return self.lex_text
  114. # '-' indicates term presence is prohibited
  115. if char == "-" and self.width == 1:
  116. self.emit(self.PRESENCE)
  117. return self.lex_text
  118. if default_separator(char):
  119. return self.lex_term