test_seekable_unicode_stream_reader.py

# -*- coding: utf-8 -*-
"""
The following test performs a random series of reads, seeks, and
tells, and checks that the results are consistent.
"""
import random
import functools
from io import BytesIO

from nltk.corpus.reader import SeekableUnicodeStreamReader
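

# A minimal, illustrative sketch of the consistency property that the
# randomized test below exercises: remember an offset with tell(), read to
# the end of the stream, seek back to that offset, and the same text must
# come back.  This helper is not collected by the test runner; its name and
# sample text are illustrative only, and it uses nothing beyond the
# read/tell/seek calls already used elsewhere in this file.
def _round_trip_sketch(text='caf\xe9 latte', encoding='utf-8'):
    reader = SeekableUnicodeStreamReader(BytesIO(text.encode(encoding)), encoding)
    reader.read(1)                 # consume a little of the stream
    pos = reader.tell()            # position on the underlying byte stream
    rest = reader.read()           # everything from pos to the end
    reader.seek(pos)               # rewind to the remembered position
    assert reader.read() == rest   # the same tail is returned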


def check_reader(unicode_string, encoding, n=1000):
    bytestr = unicode_string.encode(encoding)
    strlen = len(unicode_string)
    stream = BytesIO(bytestr)
    reader = SeekableUnicodeStreamReader(stream, encoding)

    # Find all character positions
    chars = []
    while True:
        pos = reader.tell()
        chars.append((pos, reader.read(1)))
        if chars[-1][1] == '':
            break
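    # Each entry of `chars` pairs a position reported by tell() (an offset on
    # the underlying byte stream) with the single character read there; the
    # loop relies on read(1) returning '' only once the end of the stream is
    # reached.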

    # Find all strings
    strings = dict((pos, '') for (pos, c) in chars)
    for pos1, char in chars:
        for pos2, _ in chars:
            if pos2 <= pos1:
                strings[pos2] += char
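    # After this double loop, strings[pos] holds the full text from position
    # pos through the end of the stream, so any read starting at pos must be
    # a prefix of strings[pos]; the assertions in the read branch below check
    # exactly that.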

    while True:
        op = random.choice('tsrr')

        # Check our position?
        if op == 't':  # tell
            reader.tell()

        # Perform a seek?
        if op == 's':  # seek
            new_pos = random.choice([p for (p, c) in chars])
            reader.seek(new_pos)

        # Perform a read?
        if op == 'r':  # read
            if random.random() < 0.3:
                pos = reader.tell()
            else:
                pos = None

            if random.random() < 0.2:
                size = None
            elif random.random() < 0.8:
                size = random.randint(0, int(strlen / 6))
            else:
                size = random.randint(0, strlen + 20)

            if random.random() < 0.8:
                s = reader.read(size)
            else:
                s = reader.readline(size)

            # check that everything's consistent
            if pos is not None:
                assert pos in strings
                assert strings[pos].startswith(s)

            n -= 1
            if n == 0:
                return 'passed'


# Call the randomized test function `check_reader` with a variety of
# input strings and encodings.

ENCODINGS = ['ascii', 'latin1', 'greek', 'hebrew', 'utf-16', 'utf-8']

STRINGS = [
    """
This is a test file.
It is fairly short.
""",
    "This file can be encoded with latin1. \x83",
    """\
This is a test file.
Here's a blank line:

And here's some unicode: \xee \u0123 \uffe3
""",
    """\
This is a test file.
Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555
""",
]


def test_reader():
    for string in STRINGS:
        for encoding in ENCODINGS:
            try:
                # skip strings that can't be encoded with the current encoding
                string.encode(encoding)
                yield check_reader, string, encoding
            except UnicodeEncodeError:
                pass


# nose shows the whole string argument in verbose mode; this is annoying,
# so the large-string test is kept separate.

LARGE_STRING = (
    """\
This is a larger file. It has some lines that are longer \
than 72 characters. It's got lots of repetition. Here's \
some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345
How fun! Let's repeat it twenty times.
"""
    * 10
)


def test_reader_on_large_string():
    for encoding in ENCODINGS:
        try:
            # skip strings that can't be encoded with the current encoding
            LARGE_STRING.encode(encoding)

            def _check(encoding, n=1000):
                check_reader(LARGE_STRING, encoding, n)

            yield _check, encoding
        except UnicodeEncodeError:
            pass


def test_reader_stream_is_closed():
    reader = SeekableUnicodeStreamReader(BytesIO(b''), 'ascii')
    assert reader.stream.closed is False
    reader.__del__()
    assert reader.stream.closed is True


def teardown_module(module=None):
    import gc

    gc.collect()