search_index.py

import os
import re
import json
import logging
import subprocess

from lunr import lunr
from html.parser import HTMLParser

log = logging.getLogger(__name__)


class SearchIndex:
    """
    Search index is a collection of pages and sections (heading
    tags and their following content are sections).
    """

    def __init__(self, **config):
        self._entries = []
        self.config = config

    def _find_toc_by_id(self, toc, id_):
        """
        Given a table of contents and an HTML ID, iterate through
        and return the matching item in the TOC.
        """
        for toc_item in toc:
            if toc_item.id == id_:
                return toc_item
            toc_item_r = self._find_toc_by_id(toc_item.children, id_)
            if toc_item_r is not None:
                return toc_item_r
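
    # Note: `toc` is duck-typed here. Each item only needs `id` and
    # `children` attributes for this lookup, plus `title` and `url`
    # for create_entry_for_section() below.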

    def _add_entry(self, title, text, loc):
        """
        A simple wrapper to add an entry and ensure the content
        is UTF-8 encoded.
        """
        text = text.replace('\u00a0', ' ')
        text = re.sub(r'[ \t\n\r\f\v]+', ' ', text.strip())

        self._entries.append({
            'title': title,
            'text': str(text.encode('utf-8'), encoding='utf-8'),
            'location': loc
        })
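
    # Illustrative shape of the entry appended above (made-up values):
    #   {'title': 'Setup', 'text': 'Run the installer ...',
    #    'location': 'setup/'}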

    def add_entry_from_context(self, page):
        """
        Create a set of entries in the index for a page: one for
        the page itself and then one for each of its heading
        tags.
        """
        # Create the content parser and feed in the HTML for the
        # full page. This handles all the parsing and prepares
        # us to iterate through it.
        parser = ContentParser()
        parser.feed(page.content)
        parser.close()

        # Get the absolute URL for the page; this is then
        # prepended to the URLs of the sections.
        url = page.url

        # Create an entry for the full page.
        self._add_entry(
            title=page.title,
            text=self.strip_tags(page.content).rstrip('\n'),
            loc=url
        )

        for section in parser.data:
            self.create_entry_for_section(section, page.toc, url)

    def create_entry_for_section(self, section, toc, abs_url):
        """
        Given a section on the page, the table of contents, and
        the absolute URL for the page, create an entry in the
        index.
        """
        toc_item = self._find_toc_by_id(toc, section.id)

        if toc_item is not None:
            self._add_entry(
                title=toc_item.title,
                text=" ".join(section.text),
                loc=abs_url + toc_item.url
            )
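
    # With MkDocs-style TOC items, `toc_item.url` is typically an anchor
    # such as '#section-id', so a section location ends up looking like
    # 'page/#section-id' (illustrative example).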

    def generate_search_index(self):
        """Convert the search index to JSON, optionally pre-building a lunr index."""
        page_dicts = {
            'docs': self._entries,
            'config': self.config
        }
        data = json.dumps(page_dicts, sort_keys=True, separators=(',', ':'))

        if self.config['prebuild_index'] in (True, 'node'):
            try:
                script_path = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)),
                    'prebuild-index.js'
                )
                p = subprocess.Popen(
                    ['node', script_path],
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )
                idx, err = p.communicate(data.encode('utf-8'))
                if not err:
                    idx = idx.decode('utf-8') if hasattr(idx, 'decode') else idx
                    page_dicts['index'] = json.loads(idx)
                    data = json.dumps(page_dicts, sort_keys=True, separators=(',', ':'))
                    log.debug('Pre-built search index created successfully.')
                else:
                    log.warning('Failed to pre-build search index. Error: {}'.format(err))
            except (OSError, ValueError) as e:
                log.warning('Failed to pre-build search index. Error: {}'.format(e))
        elif self.config['prebuild_index'] == 'python':
            idx = lunr(
                ref='location', fields=('title', 'text'),
                documents=self._entries, languages=self.config['lang']
            )
            page_dicts['index'] = idx.serialize()
            data = json.dumps(page_dicts, sort_keys=True, separators=(',', ':'))

        return data
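
    # Sketch of the returned JSON (keys sorted by json.dumps; 'index'
    # is only present when a prebuilt index was generated):
    #   {"config": {...}, "docs": [{"location": ..., "text": ...,
    #    "title": ...}], "index": {...}}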

    def strip_tags(self, html):
        """Strip HTML tags from the given content."""
        s = HTMLStripper()
        s.feed(html)
        return s.get_data()


class HTMLStripper(HTMLParser):
    """
    A simple HTML parser that stores all of the data within tags
    but ignores the tags themselves and thus strips them from the
    content.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.data = []

    def handle_data(self, d):
        """
        Called for the text contents of each tag.
        """
        self.data.append(d)

    def get_data(self):
        return '\n'.join(self.data)


class ContentSection:
    """
    Used by the ContentParser class to capture the information we
    need when it is parsing the HTML.
    """

    def __init__(self, text=None, id_=None, title=None):
        self.text = text or []
        self.id = id_
        self.title = title

    def __eq__(self, other):
        return all([
            self.text == other.text,
            self.id == other.id,
            self.title == other.title
        ])


class ContentParser(HTMLParser):
    """
    Given a block of HTML, group the content under the preceding
    heading tags, which can then be used for creating an index
    for that section.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.data = []
        self.section = None
        self.is_header_tag = False

    def handle_starttag(self, tag, attrs):
        """Called at the start of every HTML tag."""
        # We only care about the opening tags of headings.
        if tag not in ["h%d" % x for x in range(1, 7)]:
            return

        # We are dealing with a new header; create a new section
        # for it and assign the ID if it has one.
        self.is_header_tag = True
        self.section = ContentSection()
        self.data.append(self.section)

        for attr in attrs:
            if attr[0] == "id":
                self.section.id = attr[1]

    def handle_endtag(self, tag):
        """Called at the end of every HTML tag."""
        # We only care about the closing tags of headings.
        if tag not in ["h%d" % x for x in range(1, 7)]:
            return

        self.is_header_tag = False

    def handle_data(self, data):
        """
        Called for the text contents of each tag.
        """
        if self.section is None:
            # This means we have some content at the start of the
            # HTML before we reach a heading tag. We don't actually
            # care about that content, as it will be added to the
            # overall page entry in the search. So just skip it.
            return

        # If this is a header, then the data is the title.
        # Otherwise it is content of something under that header
        # section.
        if self.is_header_tag:
            self.section.title = data
        else:
            self.section.text.append(data.rstrip('\n'))
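

# --- Illustrative usage (added sketch; not part of the original module) ---
# A minimal demo of how ContentParser groups page content under heading
# tags and how HTMLStripper flattens markup to plain text. The HTML
# snippet below is made up for the demo.
if __name__ == '__main__':
    html = (
        '<h1 id="intro">Introduction</h1>'
        '<p>Welcome to the docs.</p>'
        '<h2 id="setup">Setup</h2>'
        '<p>Run the installer.</p>'
    )

    parser = ContentParser()
    parser.feed(html)
    parser.close()
    for section in parser.data:
        # e.g. intro: 'Introduction' -> ['Welcome to the docs.']
        print('%s: %r -> %r' % (section.id, section.title, section.text))

    stripper = HTMLStripper()
    stripper.feed(html)
    print(stripper.get_data())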