# -*- coding: utf-8 -*-
"""
    pygments.lexers.special
    ~~~~~~~~~~~~~~~~~~~~~~~

    Special lexers.

    :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
from io import BytesIO

from pygments.lexer import Lexer
from pygments.token import Token, Error, Text
from pygments.util import get_choice_opt

__all__ = ['TextLexer', 'RawTokenLexer']


class TextLexer(Lexer):
    """
    "Null" lexer, doesn't highlight anything.
    """
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']
    priority = 0.01

    def get_tokens_unprocessed(self, text):
        yield 0, Text, text

    def analyse_text(text):
        return TextLexer.priority
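
# Illustrative example: TextLexer emits its whole input as a single Text
# token, so "highlighting" plain text is effectively a no-op:
#
#     >>> list(TextLexer().get_tokens_unprocessed('spam\n'))
#     [(0, Token.Text, 'spam\n')]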

# Cache mapping raw token-type names (as bytes) to resolved Token objects.
_ttype_cache = {}

# Matches one line of raw token data, including its trailing newline.
line_re = re.compile(b'.*?\n')


class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`. This
    lexer raises exceptions during parsing if the token stream in the
    file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
        the given compression algorithm before lexing (default: ``""``).
    """
    name = 'Raw token data'
    aliases = ['raw']
    filenames = []
    mimetypes = ['application/x-pygments-tokens']

    def __init__(self, **options):
        self.compress = get_choice_opt(options, 'compress',
                                       ['', 'none', 'gz', 'bz2'], '')
        Lexer.__init__(self, **options)

    def get_tokens(self, text):
        if isinstance(text, str):
            # raw token stream never has any non-ASCII characters
            text = text.encode('ascii')
        if self.compress == 'gz':
            import gzip
            gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
            text = gzipfile.read()
        elif self.compress == 'bz2':
            import bz2
            text = bz2.decompress(text)

        # do not call Lexer.get_tokens() because we do not want Unicode
        # decoding to occur, and stripping is not optional.
        text = text.strip(b'\n') + b'\n'
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v

    def get_tokens_unprocessed(self, text):
        # Each raw line has the form b"Token.Some.Type\tu'value'\n": a dotted
        # token-type name, a tab, then the unicode-escaped repr of the value.
        length = 0
        for match in line_re.finditer(text):
            try:
                ttypestr, val = match.group().split(b'\t', 1)
            except ValueError:
                val = match.group().decode('ascii', 'replace')
                ttype = Error
            else:
                ttype = _ttype_cache.get(ttypestr)
                if not ttype:
                    ttype = Token
                    # Token type names are ASCII; decode so the dotted parts
                    # can be resolved as attribute names on Token.
                    ttypes = ttypestr.decode('ascii').split('.')[1:]
                    for ttype_ in ttypes:
                        if not ttype_ or not ttype_[0].isupper():
                            raise ValueError('malformed token name')
                        ttype = getattr(ttype, ttype_)
                    _ttype_cache[ttypestr] = ttype
                val = val[2:-2].decode('unicode-escape')
            yield length, ttype, val
            length += len(val)
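

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only): feed RawTokenLexer a
    # hand-written raw token stream in the b"Token.Some.Type\tu'value'\n"
    # form it expects, first as-is and then gzip-compressed, and print the
    # recovered (token type, value) pairs.
    import gzip

    sample = (b"Token.Keyword\tu'def'\n"
              b"Token.Text\tu' '\n"
              b"Token.Name.Function\tu'greet'\n")

    for ttype, value in RawTokenLexer().get_tokens(sample):
        print(ttype, repr(value))

    # The same stream, decompressed on the fly via the `compress` option.
    raw_gz = gzip.compress(sample)
    for ttype, value in RawTokenLexer(compress='gz').get_tokens(raw_gz):
        print(ttype, repr(value))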