preprocessing.py

# Natural Language Toolkit: Language Model Preprocessing
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from functools import partial
from itertools import chain

from nltk.util import everygrams, pad_sequence

flatten = chain.from_iterable
pad_both_ends = partial(
    pad_sequence,
    pad_left=True,
    left_pad_symbol="<s>",
    pad_right=True,
    right_pad_symbol="</s>",
)
pad_both_ends.__doc__ = """Pads both ends of a sentence to length specified by ngram order.

    Following convention, <s> pads the start of a sentence and </s> pads its end.
    """


def padded_everygrams(order, sentence):
    """Helper with some useful defaults.

    Applies pad_both_ends to sentence and follows it up with everygrams.
    """
    return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order)
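
# Illustrative example (not part of the original module): for a two-word
# sentence and order=2 this yields every unigram and bigram of the padded
# sentence (sorted here so the result does not depend on iteration order).
#
#     >>> sorted(padded_everygrams(2, ["a", "b"]))
#     [('</s>',), ('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', '</s>')]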


def padded_everygram_pipeline(order, text):
    """Default preprocessing for a sequence of sentences.

    Creates two iterators:

    - sentences padded and turned into sequences of `nltk.util.everygrams`
    - sentences padded as above and chained together for a flat stream of words

    :param order: Largest ngram length produced by `everygrams`.
    :param text: Text to iterate over. Expected to be an iterable of sentences:
        Iterable[Iterable[str]]
    :return: iterator over text as ngrams, iterator over text as vocabulary data
    """
    padding_fn = partial(pad_both_ends, n=order)
    return (
        (everygrams(list(padding_fn(sent)), max_len=order) for sent in text),
        flatten(map(padding_fn, text)),
    )
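

# Usage sketch (illustrative, not part of the original module): the two
# iterators feed directly into an `nltk.lm` model. The toy sentences below
# are made up for demonstration.
if __name__ == "__main__":
    from nltk.lm import MLE

    demo_text = [["a", "b", "c"], ["a", "c", "d", "c", "e", "f"]]
    # First iterator: per-sentence everygrams used as training data.
    # Second iterator: flat stream of padded words used to build the vocabulary.
    train_data, padded_sents = padded_everygram_pipeline(2, demo_text)
    lm = MLE(2)
    lm.fit(train_data, padded_sents)
    print(lm.counts)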