api.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Twitter API
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Ewan Klein <ewan@inf.ed.ac.uk>
  6. # Lorenzo Rubio <lrnzcig@gmail.com>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. This module provides an interface for TweetHandlers, and support for timezone
  11. handling.
  12. """
  13. import time as _time
  14. from abc import ABCMeta, abstractmethod
  15. from datetime import tzinfo, timedelta, timezone, datetime
  16. class LocalTimezoneOffsetWithUTC(tzinfo):
  17. """
  18. This is not intended to be a general purpose class for dealing with the
  19. local timezone. In particular:
  20. * it assumes that the date passed has been created using
  21. `datetime(..., tzinfo=Local)`, where `Local` is an instance of
  22. the object `LocalTimezoneOffsetWithUTC`;
  23. * for such an object, it returns the offset with UTC, used for date comparisons.
  24. Reference: https://docs.python.org/3/library/datetime.html
  25. """
  26. STDOFFSET = timedelta(seconds=-_time.timezone)
  27. if _time.daylight:
  28. DSTOFFSET = timedelta(seconds=-_time.altzone)
  29. else:
  30. DSTOFFSET = STDOFFSET
  31. def utcoffset(self, dt):
  32. """
  33. Access the relevant time offset.
  34. """
  35. return self.DSTOFFSET
  36. LOCAL = LocalTimezoneOffsetWithUTC()
  37. class BasicTweetHandler(metaclass=ABCMeta):
  38. """
  39. Minimal implementation of `TweetHandler`.
  40. Counts the number of Tweets and decides when the client should stop
  41. fetching them.
  42. """
  43. def __init__(self, limit=20):
  44. self.limit = limit
  45. self.counter = 0
  46. """
  47. A flag to indicate to the client whether to stop fetching data given
  48. some condition (e.g., reaching a date limit).
  49. """
  50. self.do_stop = False
  51. """
  52. Stores the id of the last fetched Tweet to handle pagination.
  53. """
  54. self.max_id = None
  55. def do_continue(self):
  56. """
  57. Returns `False` if the client should stop fetching Tweets.
  58. """
  59. return self.counter < self.limit and not self.do_stop
  60. class TweetHandlerI(BasicTweetHandler):
  61. """
  62. Interface class whose subclasses should implement a handle method that
  63. Twitter clients can delegate to.
  64. """
  65. def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None):
  66. """
  67. :param int limit: The number of data items to process in the current\
  68. round of processing.
  69. :param tuple upper_date_limit: The date at which to stop collecting\
  70. new data. This should be entered as a tuple which can serve as the\
  71. argument to `datetime.datetime`.\
  72. E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:30 pm on April 1 2015.
  73. :param tuple lower_date_limit: The date at which to stop collecting\
  74. new data. See `upper_data_limit` for formatting.
  75. """
  76. BasicTweetHandler.__init__(self, limit)
  77. self.upper_date_limit = None
  78. self.lower_date_limit = None
  79. if upper_date_limit:
  80. self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL)
  81. if lower_date_limit:
  82. self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL)
  83. self.startingup = True
  84. @abstractmethod
  85. def handle(self, data):
  86. """
  87. Deal appropriately with data returned by the Twitter API
  88. """
  89. @abstractmethod
  90. def on_finish(self):
  91. """
  92. Actions when the tweet limit has been reached
  93. """
  94. def check_date_limit(self, data, verbose=False):
  95. """
  96. Validate date limits.
  97. """
  98. if self.upper_date_limit or self.lower_date_limit:
  99. date_fmt = "%a %b %d %H:%M:%S +0000 %Y"
  100. tweet_date = datetime.strptime(data["created_at"], date_fmt).replace(
  101. tzinfo=timezone.utc
  102. )
  103. if (self.upper_date_limit and tweet_date > self.upper_date_limit) or (
  104. self.lower_date_limit and tweet_date < self.lower_date_limit
  105. ):
  106. if self.upper_date_limit:
  107. message = "earlier"
  108. date_limit = self.upper_date_limit
  109. else:
  110. message = "later"
  111. date_limit = self.lower_date_limit
  112. if verbose:
  113. print(
  114. "Date limit {0} is {1} than date of current tweet {2}".format(
  115. date_limit, message, tweet_date
  116. )
  117. )
  118. self.do_stop = True