# Natural Language Toolkit: Twitter Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
"""

import json
import os

from nltk.tokenize import TweetTokenizer
from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
from nltk.corpus.reader.api import CorpusReader


class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as line-delimited
    JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', fileids=r'.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environment variable `TWITTER`, and then invoke the reader
    as follows::

        root = os.environ['TWITTER']
        reader = TwitterCorpusReader(root, r'.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

        import json
        for tweet in reader.docs():
            print(json.dumps(tweet, indent=1, sort_keys=True))
    """

    CorpusView = StreamBackedCorpusView
    """
    The corpus view class used by this reader.
    """

    def __init__(
        self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8"
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
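
        For example, a sketch that assumes `reader` has been initialised as
        in the class docstring and that the standard Tweet fields
        `created_at` and `id` are present::

            for tweet in reader.docs():
                print(tweet['created_at'], tweet['id'])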
  67. """
  68. return concat(
  69. [
  70. self.CorpusView(path, self._read_tweets, encoding=enc)
  71. for (path, enc, fileid) in self.abspaths(fileids, True, True)
  72. ]
  73. )

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s)

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
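
        For example, a sketch assuming `reader` has been initialised as in
        the class docstring::

            for text in reader.strings()[:5]:
                print(text)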
        """
        fulltweets = self.docs(fileids)
        tweets = []
        for jsono in fulltweets:
            try:
                text = jsono["text"]
                if isinstance(text, bytes):
                    text = text.decode(self.encoding)
                tweets.append(text)
            except KeyError:
                # Skip entries with no 'text' field (e.g. delete notices).
                pass
        return tweets

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets,
            each as a list of words, screen names, hashtags, URLs and
            punctuation symbols.
        :rtype: list(list(str))
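
        For example, a sketch whose output depends on the tokenizer passed
        to the constructor (by default `TweetTokenizer`)::

            for toks in reader.tokenized()[:2]:
                print(toks)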
        """
        tweets = self.strings(fileids)
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(t) for t in tweets]

    def raw(self, fileids=None):
        """
        Return the corpora in their raw form.
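
        For example, a sketch printing the first 100 characters of the
        serialised JSON::

            print(reader.raw()[:100])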
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def _read_tweets(self, stream):
        """
        Assumes that each line in ``stream`` is a JSON-serialised object.
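
        Reads up to 10 Tweets per call, so that the enclosing
        `StreamBackedCorpusView` can fetch the corpus in blocks.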
        """
        tweets = []
        # Read a block of up to 10 Tweets at a time.
        for i in range(10):
            line = stream.readline()
            if not line:
                return tweets
            tweet = json.loads(line)
            tweets.append(tweet)
        return tweets