You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
152 lines
4.5 KiB
Python
152 lines
4.5 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Natural Language Toolkit: Twitter API
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
|
# Lorenzo Rubio <lrnzcig@gmail.com>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
This module provides an interface for TweetHandlers, and support for timezone
|
|
handling.
|
|
"""
|
|
|
|
import time as _time
|
|
from abc import ABCMeta, abstractmethod
|
|
from datetime import tzinfo, timedelta, datetime
|
|
|
|
from six import add_metaclass
|
|
|
|
from nltk.compat import UTC
|
|
|
|
|
|
class LocalTimezoneOffsetWithUTC(tzinfo):
    """
    Minimal `tzinfo` that reports the local machine's offset from UTC.

    This is deliberately not a general purpose local-timezone class:

    * it expects dates built as `datetime(..., tzinfo=Local)`, where `Local`
      is an instance of `LocalTimezoneOffsetWithUTC`;
    * for such dates it only supplies the UTC offset, so that they can be
      compared with other timezone-aware dates.

    Note that `utcoffset` returns `DSTOFFSET` whatever date it is given; the
    date itself is never inspected.

    Reference: https://docs.python.org/3/library/datetime.html
    """

    # Offset of the non-DST local zone (`time.timezone` counts seconds *west*
    # of UTC, hence the sign flip).
    STDOFFSET = timedelta(seconds=-_time.timezone)

    # Offset of the DST local zone when the platform defines one; otherwise
    # the same as the standard offset.
    DSTOFFSET = timedelta(seconds=-_time.altzone) if _time.daylight else STDOFFSET

    def utcoffset(self, dt):
        """
        Return the offset from UTC as a `timedelta`.

        :param dt: the date being queried; it is ignored.
        """
        return self.DSTOFFSET
|
|
|
|
|
|
# Module-level tzinfo instance; `TweetHandlerI` passes it as `tzinfo` when it
# turns the user-supplied date tuples into timezone-aware datetimes.
LOCAL = LocalTimezoneOffsetWithUTC()
|
|
|
|
|
|
@add_metaclass(ABCMeta)
class BasicTweetHandler(object):
    """
    Bare-bones `TweetHandler`.

    Keeps a running count of Tweets and tells the client when it should stop
    fetching more of them.
    """

    def __init__(self, limit=20):
        """
        :param int limit: fetching continues only while `counter` is below
            this value.
        """
        self.limit = limit
        self.counter = 0

        # A flag to indicate to the client whether to stop fetching data given
        # some condition (e.g., reaching a date limit).
        self.do_stop = False

        # Stores the id of the last fetched Tweet to handle pagination.
        self.max_id = None

    def do_continue(self):
        """
        Report whether the client may keep fetching Tweets.

        :return: `False` once `counter` has reached `limit` or `do_stop` has
            been set; `True` otherwise.
        """
        if self.counter < self.limit:
            return not self.do_stop
        return False
|
|
|
|
|
|
class TweetHandlerI(BasicTweetHandler):
    """
    Interface class whose subclasses should implement a handle method that
    Twitter clients can delegate to.
    """

    def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None):
        """
        :param int limit: The number of data items to process in the current
            round of processing.

        :param tuple upper_date_limit: The date after which to stop
            collecting data: a Tweet created later than this date sets
            `do_stop`. This should be entered as a tuple which can serve as
            the argument to `datetime.datetime`; it is interpreted in the
            local timezone.
            E.g. `upper_date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on
            April 1 2015.

        :param tuple lower_date_limit: The date before which to stop
            collecting data: a Tweet created earlier than this date sets
            `do_stop`. See `upper_date_limit` for formatting.
        """
        BasicTweetHandler.__init__(self, limit)

        self.upper_date_limit = None
        self.lower_date_limit = None
        # A missing (or empty) tuple leaves the corresponding limit disabled.
        if upper_date_limit:
            self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL)
        if lower_date_limit:
            self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL)

        self.startingup = True

    @abstractmethod
    def handle(self, data):
        """
        Deal appropriately with data returned by the Twitter API
        """

    @abstractmethod
    def on_finish(self):
        """
        Actions when the tweet limit has been reached
        """

    def check_date_limit(self, data, verbose=False):
        """
        Set `do_stop` when the Tweet falls outside the configured date limits.

        Does nothing when neither limit is configured.

        :param dict data: a Tweet; its `'created_at'` value is parsed with the
            format `'%a %b %d %H:%M:%S +0000 %Y'` and treated as UTC.
        :param bool verbose: if `True`, print which limit was violated.
        :raises ValueError: if `'created_at'` does not match the expected
            format (raised by `datetime.strptime`).
        """
        if not (self.upper_date_limit or self.lower_date_limit):
            return

        date_fmt = '%a %b %d %H:%M:%S +0000 %Y'
        tweet_date = datetime.strptime(data['created_at'], date_fmt).replace(
            tzinfo=UTC
        )

        # Report the limit that was actually violated. Choosing by which limit
        # is merely configured would blame the upper limit for a Tweet that is
        # older than the lower one whenever both limits are set.
        if self.upper_date_limit and tweet_date > self.upper_date_limit:
            message = "earlier"
            date_limit = self.upper_date_limit
        elif self.lower_date_limit and tweet_date < self.lower_date_limit:
            message = "later"
            date_limit = self.lower_date_limit
        else:
            # Tweet is within the limits: nothing to do.
            return

        if verbose:
            print(
                "Date limit {0} is {1} than date of current tweet {2}".format(
                    date_limit, message, tweet_date
                )
            )
        self.do_stop = True
|