cli.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: NLTK Command-Line Interface
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. from functools import partial
  8. from itertools import chain
  9. from tqdm import tqdm
  10. import click
  11. from nltk import word_tokenize
  12. from nltk.util import parallelize_preprocess
  13. CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
  14. @click.group(context_settings=CONTEXT_SETTINGS)
  15. @click.version_option()
  16. def cli():
  17. pass
  18. @cli.command("tokenize")
  19. @click.option(
  20. "--language",
  21. "-l",
  22. default="en",
  23. help="The language for the Punkt sentence tokenization.",
  24. )
  25. @click.option(
  26. "--preserve-line",
  27. "-l",
  28. default=True,
  29. is_flag=True,
  30. help="An option to keep the preserve the sentence and not sentence tokenize it.",
  31. )
  32. @click.option("--processes", "-j", default=1, help="No. of processes.")
  33. @click.option("--encoding", "-e", default="utf8", help="Specify encoding of file.")
  34. @click.option(
  35. "--delimiter", "-d", default=" ", help="Specify delimiter to join the tokens."
  36. )
  37. def tokenize_file(language, preserve_line, processes, encoding, delimiter):
  38. """ This command tokenizes text stream using nltk.word_tokenize """
  39. with click.get_text_stream("stdin", encoding=encoding) as fin:
  40. with click.get_text_stream("stdout", encoding=encoding) as fout:
  41. # If it's single process, joblib parallization is slower,
  42. # so just process line by line normally.
  43. if processes == 1:
  44. for line in tqdm(fin.readlines()):
  45. print(delimiter.join(word_tokenize(line)), end="\n", file=fout)
  46. else:
  47. for outline in parallelize_preprocess(
  48. word_tokenize, fin.readlines(), processes, progress_bar=True
  49. ):
  50. print(delimiter.join(outline), end="\n", file=fout)