You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
152 lines
4.5 KiB
Python
152 lines
4.5 KiB
Python
5 years ago
|
# -*- coding: utf-8 -*-
|
||
|
# Natural Language Toolkit: Twitter API
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||
|
# Lorenzo Rubio <lrnzcig@gmail.com>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
This module provides an interface for TweetHandlers, and support for timezone
|
||
|
handling.
|
||
|
"""
|
||
|
|
||
|
import time as _time
|
||
|
from abc import ABCMeta, abstractmethod
|
||
|
from datetime import tzinfo, timedelta, datetime
|
||
|
|
||
|
from six import add_metaclass
|
||
|
|
||
|
from nltk.compat import UTC
|
||
|
|
||
|
|
||
|
class LocalTimezoneOffsetWithUTC(tzinfo):
    """
    This is not intended to be a general purpose class for dealing with the
    local timezone. In particular:

    * it assumes that the date passed has been created using
      `datetime(..., tzinfo=Local)`, where `Local` is an instance of
      the object `LocalTimezoneOffsetWithUTC`;
    * for such an object, it returns the offset with UTC, used for date comparisons.

    The offset is chosen per date: `DSTOFFSET` when the platform's local-time
    rules (as exposed by the `time` module) report daylight saving time for
    that date, `STDOFFSET` otherwise.

    Reference: https://docs.python.org/3/library/datetime.html
    """

    # Offset from UTC while standard time is in effect.
    STDOFFSET = timedelta(seconds=-_time.timezone)

    # Offset from UTC while daylight saving time is in effect; identical to
    # STDOFFSET when the local zone defines no DST.
    if _time.daylight:
        DSTOFFSET = timedelta(seconds=-_time.altzone)
    else:
        DSTOFFSET = STDOFFSET

    def _isdst(self, dt):
        """
        Tell whether daylight saving time applies to the local wall-clock
        time `dt`.

        When `dt` is `None`, or the platform cannot convert it, fall back to
        "DST whenever the zone defines it", which reproduces the offset this
        class historically returned for every input.
        """
        if dt is None:
            return bool(_time.daylight)
        try:
            # Strip the tzinfo first: `timetuple()` on an aware datetime
            # would call back into `dst()` and recurse.
            stamp = _time.mktime(dt.replace(tzinfo=None).timetuple())
            return _time.localtime(stamp).tm_isdst > 0
        except (OverflowError, ValueError, OSError):
            # Date outside the range supported by the C library.
            return bool(_time.daylight)

    def utcoffset(self, dt):
        """
        Access the relevant time offset.

        :param dt: the local date whose offset is wanted, or `None`.
        :return: `DSTOFFSET` if DST applies to `dt`, else `STDOFFSET`.
        :rtype: timedelta
        """
        return self.DSTOFFSET if self._isdst(dt) else self.STDOFFSET

    def dst(self, dt):
        """
        Return the daylight saving adjustment included in `utcoffset(dt)`.

        :rtype: timedelta
        """
        if self._isdst(dt):
            return self.DSTOFFSET - self.STDOFFSET
        return timedelta(0)

    def tzname(self, dt):
        """
        Return the platform's name for the local zone at `dt`.

        :rtype: str
        """
        return _time.tzname[self._isdst(dt)]
|
||
|
|
||
|
|
||
|
# Shared tzinfo instance; `TweetHandlerI.__init__` attaches it to the date
# limits it builds from the caller-supplied tuples.
LOCAL = LocalTimezoneOffsetWithUTC()
|
||
|
|
||
|
|
||
|
@add_metaclass(ABCMeta)
class BasicTweetHandler(object):
    """
    Bare-bones `TweetHandler`.

    Tracks how many Tweets have been counted and tells the client when it
    should stop fetching more.
    """

    def __init__(self, limit=20):
        # `counter` is compared against `limit` in `do_continue`; this class
        # never increments it itself (presumably subclasses or clients do).
        self.counter = 0
        self.limit = limit

        # A flag to indicate to the client whether to stop fetching data given
        # some condition (e.g., reaching a date limit).
        self.do_stop = False

        # Stores the id of the last fetched Tweet to handle pagination.
        self.max_id = None

    def do_continue(self):
        """
        Tell the client whether to keep fetching Tweets.

        The result is falsy once `counter` has reached `limit`, or as soon
        as `do_stop` has been set.
        """
        below_limit = self.counter < self.limit
        if not below_limit:
            return below_limit
        return not self.do_stop
|
||
|
|
||
|
|
||
|
class TweetHandlerI(BasicTweetHandler):
    """
    Interface class whose subclasses should implement a handle method that
    Twitter clients can delegate to.
    """

    def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None):
        """
        :param int limit: The number of data items to process in the current
            round of processing.

        :param tuple upper_date_limit: The date at which to stop collecting
            new data: a Tweet dated after it makes `check_date_limit` set
            `do_stop`. This should be entered as a tuple which can serve as
            the argument to `datetime.datetime`; it is given the `LOCAL`
            tzinfo.
            E.g. `upper_date_limit=(2015, 4, 1, 12, 40)` for 12:40 pm on
            April 1 2015.

        :param tuple lower_date_limit: The date at which to stop collecting
            new data: a Tweet dated before it makes `check_date_limit` set
            `do_stop`. See `upper_date_limit` for formatting.
        """
        BasicTweetHandler.__init__(self, limit)

        self.upper_date_limit = None
        self.lower_date_limit = None
        # Truthiness (not `is not None`) is deliberate: an empty tuple also
        # means "no limit" rather than an invalid `datetime()` call.
        if upper_date_limit:
            self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL)
        if lower_date_limit:
            self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL)

        # Initialised to True and not read anywhere in this class.
        self.startingup = True

    @abstractmethod
    def handle(self, data):
        """
        Deal appropriately with data returned by the Twitter API
        """

    @abstractmethod
    def on_finish(self):
        """
        Actions when the tweet limit has been reached
        """

    def check_date_limit(self, data, verbose=False):
        """
        Validate date limits.

        Parses `data['created_at']` (expected in the format
        `'%a %b %d %H:%M:%S +0000 %Y'`) as a UTC date and sets `do_stop`
        when that date is later than `upper_date_limit` or earlier than
        `lower_date_limit`. Does nothing when neither limit is configured.

        :param dict data: a Tweet; only its `'created_at'` entry is read.
        :param bool verbose: if `True`, print which limit was exceeded.
        """
        if not (self.upper_date_limit or self.lower_date_limit):
            return

        date_fmt = '%a %b %d %H:%M:%S +0000 %Y'
        tweet_date = datetime.strptime(data['created_at'], date_fmt).replace(
            tzinfo=UTC
        )

        # Report the limit that was actually violated, so the message stays
        # correct when both limits are set and only the lower one is crossed.
        if self.upper_date_limit and tweet_date > self.upper_date_limit:
            message = "earlier"
            date_limit = self.upper_date_limit
        elif self.lower_date_limit and tweet_date < self.lower_date_limit:
            message = "later"
            date_limit = self.lower_date_limit
        else:
            return

        if verbose:
            print(
                "Date limit {0} is {1} than date of current tweet {2}".format(
                    date_limit, message, tweet_date
                )
            )
        self.do_stop = True
|