Source code for mongoUtils.pubsub

"""publish and/or subscribe to a collection 
"""

from datetime import datetime
from pymongo.errors import AutoReconnect
from pymongo.cursor import CursorType
from pymongo import collection, ReturnDocument
from bson.objectid import ObjectId
from bson import SON, CodecOptions
from time import sleep
from mongoUtils.helpers import AuxTools, db_capped_set_or_get, MongoUtilsError
from mongoUtils.aggregation import Aggregation
from Hellas.Delphi import auto_retry
from Hellas.Sparta import DotDot, EnumLabels


class MsgState(EnumLabels):
    """Enumerates the processing state of a message; used by classes :class:`Sub` and :class:`PubSub`."""
    UKNKOWN = 0    # unknown state (member-name typo kept for backward compatibility)
    SENT = 1       # message is unprocessed
    RECEIVED = 2   # message has been picked up for processing
    SUCCES = 3     # message processed successfully (member-name typo kept for backward compatibility)
    FAIL = 4       # message processed but failed (any int > 10 < 100 is considered an error type)
class Acknowledge(EnumLabels):
    """Enumerates the acknowledgement level requested for a message; used by class :class:`PubSub`."""
    NO = 0       # no acknowledgement required (kind of broadcast message)
    RECEIPT = 1  # acknowledge by setting state after receiving the message
    RESULTS = 2  # acknowledge by setting state and issuing a result message with original as parent
class SubTarget(EnumLabels):
    """Enumerates who a subscription listens to; a string expression can also be given instead.
    Used by class :class:`~pubsub.PubSub`.
    """
    ANY = 1          # listen to any targets
    NAME = 2         # listen only to messages targeting (by name) this target
    NAME_OR_ANY = 3  # combination of ANY and NAME above
    NAME_RX = 4      # listen to messages targeting names starting with Name (regex)
class MongoUtilsPubSubError(MongoUtilsError):
    """Base class for all MongoUtils pubsub exceptions (raised by :class:`Sub` and :class:`PubSub`)."""
class Sub(object):
    """**generic class for subscribing to a collection, useful for implementing
    task-message queues and oplog tailing** `see here <https://softwaremill.com/mqperf/>`_

    It can also be used with non capped collections except it must use polling
    instead of a tailing cursor until it has at least one document stored.
    i.e. to view your local see example test_SubToCappedOptLog;
    to replay the oplog set database to 'local', collection to 'oplog.rs' and track_field to 'ts'.

    :Parameters:
        - a_collection: (obj) a pymongo collection
        - track_field: (str) optional the name of field to base the query (defaults to None)
            - it must be a first level field i.e. contains no dots
            - it's values must be monotonic increasing
            - it must be an indexed field (especially for large collections)
            - if None the instance will try to get one by examining the documents in collection
        - name: (str) a name for this instance if not given defaults to db.name|collection.name

    :Raises:
        - MongoUtilsPubSubError if a track_field is not provided and can't be obtained automatically
        - MongoUtilsPubSubError if track field contains dots
    """

    def __init__(self, a_collection, track_field=None, name=None):
        self._collection = a_collection
        self._capped = self._collection.options().get('capped')
        if track_field is None:
            track_field = self._suggest_track_field()
        elif track_field.find('.') > -1:
            raise MongoUtilsPubSubError('track_field is not first level')
        self._track_field = track_field
        # NOTE(review): default name reads `self._collection.db` — confirm the collection
        # object exposes a `db` attribute (pymongo Collections expose `database`)
        self._name = ("{:7s}|{:8s}".format(self._collection.db.name, self._collection.name)
                      if name is None else name)
        if track_field is None:
            raise MongoUtilsPubSubError('no track_field')
        self._dt_utc_start = datetime.utcnow()
        self._continue = True  # cleared by stop() to end tail/poll generators

    def _suggest_track_field(self):
        """best-effort guess of a monotonic track field from the first document;
        returns 'ts' (capped collections, e.g. the oplog), '_id' (ObjectId ids) or None
        """
        doc_first = self._collection.find_one()
        if doc_first is not None and self._capped is True and 'ts' in doc_first.keys():
            return 'ts'
        if doc_first is not None and isinstance(doc_first['_id'], ObjectId):
            return '_id'
        return None

    @property
    def name(self):
        return self._name

    def _init_query(self, start_from_last=True):
        """oplog_replay query option needs {'$gte' or '$gt': ts}

        :Parameters:
            - start_from_last [True|False|value] (defaults to True)
                - on next inserted document if True
                - from 1st document if False i.e kind of replay
                - on next document after self._track_field=number if number
        """
        if start_from_last is True:
            doc = self._collection.find_one(sort=[("$natural", -1)])
        elif start_from_last is False:
            doc = self._collection.find_one(sort=[("$natural", 1)])
        else:  # then must be a value of the track field
            doc = self._collection.find_one({self._track_field: start_from_last})
        track_field_val = doc[self._track_field] if doc is not None else None
        if track_field_val:
            return {self._track_field: {'$gt' if start_from_last is True else '$gte': track_field_val}}
        return {}

    def _projection_validate(self, projection):
        """normalizes projection to a dict and forces inclusion of the track field"""
        if projection is None:
            return None
        if isinstance(projection, (list, tuple)):
            projection = {i: 1 for i in projection}
        projection.update({self._track_field: 1})
        return projection

    def _get_cursor(self, query=None, projection=None, start_from_last=True):
        """builds a (tailable if capped) cursor; mutates `query` in place so
        retries pick up the latest track-field bound
        """
        if query is None:  # avoid a shared mutable default argument
            query = {}
        query.update(self._init_query(start_from_last=start_from_last))
        if self._capped:
            cursor = self._collection.find(query, projection=projection,
                                           # No hint for this type of cursor
                                           cursor_type=CursorType.TAILABLE_AWAIT,
                                           oplog_replay=True)
        else:
            # BUG FIX: original referenced undefined `self.projection` (AttributeError)
            cursor = self._collection.find(query, projection=projection, sort=[('$natural', -1)])
            cursor.hint([('$natural', -1)])
            if start_from_last is True:
                cursor.skip(cursor.count())
        return cursor

    @auto_retry(AutoReconnect, 6, 0.5, 1)
    def tail(self, query, projection=None, start_from_last=True, sleep_secs=0.001,
             filter_func=lambda x: x):
        """subscribe to a capped collection via a tailing cursor. Method is thread safe.

        :Parameters:
            - query: a pymongo filter dictionary to filter results
            - start_from_last [True|False|value] (defaults to True)
                - True: on next inserted document
                - False: from 1st document i.e kind of replay
                - value: on next document after self._track_field=value if any other value
            - sleep_secs: (int or float) seconds to wait on cursor StopIteration
              a small number (0 - 0.001) makes it more responsive,
              a bigger one (0.01 - 1) more efficient
            - filter_func: a function to filter/modify returned docs (defaults to lambda x: x)
              if filter function returns None doc is skipped
        """
        projection = self._projection_validate(projection)
        retry_on_dead_cursor = True
        retry = True
        cursor = None  # keep defined for tail_exit even if loop never runs
        if start_from_last is True and self._collection.count() == 0:
            start_from_last = False  # 'last' doesn't apply to an empty collection
        while retry and self._continue:
            cursor = self._get_cursor(query, projection, start_from_last)
            while cursor.alive and self._continue:
                try:
                    doc = next(cursor)
                    filtered_doc = filter_func(doc)
                    if filtered_doc is not None:
                        yield filtered_doc
                except StopIteration:
                    sleep(sleep_secs)
            if retry_on_dead_cursor:
                sleep(1)  # collection is empty or something
                print("retryOnDeadCursor")
            else:
                retry = False
        self.tail_exit(cursor)

    def poll(self, query=None, projection=None, start_from_last=True, sleep_secs=1,
             filter_func=lambda x: x, limit=10):
        """subscribe by poll; in case collection is not capped or when response time
        is not critical, instead of a tailing cursor we can use poll. Method is thread safe.

        :Parameters:
            - limit: (int) number of documents to return in each batch;
              a small number (1-10) makes it more responsive,
              a larger value (100 - 1000) makes it more efficient
            - see :meth:`tail` method for other parameters
        """
        if query is None:  # avoid a shared mutable default (batches mutate it to page forward)
            query = {}
        projection = self._projection_validate(projection)

        @auto_retry(AutoReconnect, 6, 0.5, 1)
        def next_batch():
            docs = self._collection.find(query, sort=[(self._track_field, 1)],
                                         projection=projection, limit=limit)
            doc_last = None
            for doc in docs:
                doc_last = doc
                filtered_doc = filter_func(doc)
                if filtered_doc is not None:
                    yield filtered_doc
            if doc_last is not None:
                # advance the window past the last document seen
                query[self._track_field] = {'$gt': doc_last[self._track_field]}

        query.update(self._init_query(start_from_last))
        while self._continue:
            for d in next_batch():
                yield d
            sleep(sleep_secs)

    def sub(self, *args, **kwargs):
        """wrapper around poll and tail; uses tail if collection is capped else poll"""
        return self.tail(*args, **kwargs) if self._capped else self.poll(*args, **kwargs)

    def tail_exit(self, cursor):
        """called when tail exits; useful only for debugging i.e. check cursor state etc"""
        pass

    def stop(self):
        """stops subscription"""
        self._continue = False

    def restart(self):
        """allows subscription generators to run again after a stop()"""
        self._continue = True

    def __repr__(self):
        return '<{}: {}>'.format(self.__class__.__name__, self.name)
class PubSub(Sub):
    """**generic class for Publishing/Subscribing to a collection**
    `see here <https://softwaremill.com/mqperf/>`_

    It can also be used with non capped collections except it must use polling
    instead of a tailing cursor.

    .. Warning:: In case you use this class to tail the oplog make sure you DO NOT
       attempt writing to the oplog collection, and that you understand potential
       side effects as described `here <https://www.mongodb.com/blog/post/\
       pitfalls-and-workarounds-for-tailing-the-oplog-on-a-mongodb-sharded-cluster>`_

    .. Seealso:: more info
       `here <http://blog.pythonisito.com/2013/04/mongodb-pubsub-with-capped-collections.html>`__ and
       `here <https://github.com/rick446/MongoTools/blob/master/mongotools/pubsub/channel.py>`__

    :Parameters:
        - collection_or_name: (obj or str) a pymongo collection or a string
        - db: (obj optional) a pymongo db instance, only needed if collection_or_name is a string
        - name: (str) a name for this instance if not given defaults to db.name|collection.name
        - capped: (bool optional) set to True to get a capped collection
        - reset: (bool) drops & recreates collection and resets id counters if True
        - size: (int) capped collection size in bytes
        - max_docs: (int) capped collection max documents count
    """
    _max_name_len = 32
    _reserve_name = " " * _max_name_len  # reserved bytes in a document to ensure it will not grow
    _dt_frmt_info = "{} {:%Y-%m-%d %H:%M:%S %f}"

    def __init__(self, collection_or_name, db=None, name=None, capped=True, reset=False,
                 size=2 ** 30,  # ~1 GB
                 max_docs=None):
        self._coll_init_specs = {'capped': capped, 'size': size, 'max_docs': max_docs}
        if isinstance(collection_or_name, collection.Collection):
            self._col_name = collection_or_name.name
            self.db = collection_or_name.database
        else:
            # explicit raise instead of assert (asserts are stripped under -O)
            if db is None:
                raise MongoUtilsPubSubError('db is required when collection_or_name is a string')
            self._col_name = collection_or_name
            self.db = db
        self.aux_tools = AuxTools(db=self.db)
        if reset:
            self.reset()
        a_collection = self._create_collection()
        super(PubSub, self).__init__(a_collection=a_collection, track_field='ts', name=name)
        if len(self._name) > self._max_name_len:
            raise MongoUtilsPubSubError("name can't be greater than {:2d} chars".format(self._max_name_len))
        # ensure_index is deprecated in pymongo 3.x and removed in 4.x
        a_collection.create_index("ts", background=True)

    def reset(self):
        """drops collection and resets sequence generator"""
        self.db.drop_collection(self._col_name)
        self.aux_tools.sequence_reset(self._col_name)
        self._collection = self._create_collection()

    def _create_collection(self):
        """(re)creates the underlying collection per init specs, with SON documents
        so field order is preserved (needed for proper indexing of compound _id)
        """
        specs = self._coll_init_specs
        if specs['capped'] is True:
            a_collection = db_capped_set_or_get(self.db, self._col_name, specs['size'], specs['max_docs'])
        else:
            a_collection = self.db[self._col_name]
        opts = CodecOptions(document_class=SON)
        return a_collection.with_options(codec_options=opts)

    def _id_next(self):
        """next value of the monotonic message-id sequence"""
        return self.aux_tools.sequence_next(self._col_name)

    def _acknowledge(self, fltr, up):
        return self._collection.find_one_and_update(fltr, up, upsert=False,
                                                    return_document=ReturnDocument.AFTER)

    def _acknowledge_received(self, msg):
        """marks doc as received; we check state to make sure that it was not
        picked by another client meanwhile
        """
        fltr = {'_id': msg['_id'], 'status.state': MsgState.SENT}
        up = {'$set': {'status.state': MsgState.RECEIVED,
                       'dt.received': datetime.utcnow(),
                       'status.receivedBy': self._name}}
        return self._acknowledge(fltr, up)

    def _yield_doc(self, msg):
        """descendants should check the doc and return None if they don't want
        to yield it, else the doc to yield
        """
        return msg if msg['ackn'] == Acknowledge.NO else self._acknowledge_received(msg)

    def acknowledge_done(self, msg, state=MsgState.SUCCES):
        """marks a received message as completed with the given state

        :Raises: MongoUtilsPubSubError if the message was not found in RECEIVED
                 state owned by this instance
        """
        fltr = {'_id': msg['_id'], 'status.state': MsgState.RECEIVED, 'status.receivedBy': self._name}
        up = {'$set': {'status.state': state, 'dt.completed': datetime.utcnow()}}
        rt = self._acknowledge(fltr, up)
        if rt is None:
            raise MongoUtilsPubSubError('message not found')
        return rt

    @auto_retry(AutoReconnect, 6, 1, 1)  # todo: check new pymongo errors
    def _insert_msg(self, payload, topic, verb, target, state, ackn, parent=0, sentBy=None):
        """we use SON to ensure order so we can index properly"""
        if sentBy is None:
            sentBy = self.name
        ts = self._id_next()
        _id = SON([('id', ts), ('parent', parent)])
        address = SON([('topic', topic), ('verb', verb), ('target', target)])
        dt = SON([('sent', datetime.utcnow()),
                  ('received', datetime(1900, 1, 1)),
                  ('completed', datetime(1900, 1, 1))])
        status = SON([('state', state), ('sentBy', sentBy),
                      ('receivedBy', self._reserve_name)])  # reserve space so document will not grow on update
        msg = SON([('_id', _id), ('ts', ts), ('ackn', ackn), ('address', address),
                   ('status', status), ('dt', dt), ('payload', payload)])
        return self._collection.insert_one(msg)

    def pub(self, payload, topic='', verb='', target=None, ackn=Acknowledge.RECEIPT, sentBy=None):
        """publish a message

        :Parameters:
            - payload: (dict) message body
            - topic: message topic
            - verb: message verb
            - target: str or None specifies target(s) name
            - ackn: Request acknowledge see :class:`Acknowledge` class
            - sentBy: str or None identifies sender (if None defaults to instance name)
        """
        return self._insert_msg(payload, topic, verb, target, state=MsgState.SENT,
                                ackn=ackn, sentBy=sentBy)

    def _query(self, topic=None, verb=None, target=True, state=MsgState.SENT):
        """builds the subscription filter as a SON; None/'' criteria are omitted"""
        def update_son(key, val):
            if val is not None and val != '':
                qson.update({key: val})

        if target == SubTarget.ANY:
            target = None
        elif target == SubTarget.NAME:
            target = self._name
        elif target == SubTarget.NAME_OR_ANY:
            # BUG FIX: '$or' is not a field-level operator; '$in' matches any listed value
            target = {'$in': [self._name, None, '']}
        elif target == SubTarget.NAME_RX:
            target = {'$regex': '^' + self._name + '.'}
        qson = SON()
        update_son('address.topic', topic)
        update_son('address.verb', verb)
        update_son('address.target', target)
        update_son('status.state', state)
        return qson

    def tail(self, topic=None, verb=None, target=SubTarget.NAME, projection=None,
             start_from_last=True, sleep_secs=0.01):
        """subscribe by tail

        :Parameters:
            - topic: any arbitrary value specifying a topic or None
            - verb: any arbitrary value specifying a verb or None
            - target: a value specifying target listener or None, see :class:`~pubsub.SubTarget` class
            - projection: a pymongo projection specifying which fields to return or None
            - start_from_last: see :meth:`Sub.tail` method
            - sleep_secs: see :meth:`Sub.tail` method
        """
        # state=None: acknowledgement in _yield_doc filters out already-received messages
        query = self._query(topic, verb, target, None)
        return super(PubSub, self).tail(query, projection=projection,
                                        start_from_last=start_from_last,
                                        sleep_secs=sleep_secs, filter_func=self._yield_doc)

    def poll(self, topic=None, verb=None, target=SubTarget.NAME, projection=None,
             start_from_last=True, sleep_secs=1, limit=10):
        """subscribe by poll

        :Parameters: see methods :meth:`Sub.poll` and :meth:`PubSub.tail`
        """
        # BUG FIX: original method had only a docstring and returned None;
        # mirror tail() but delegate to the parent's polling generator
        query = self._query(topic, verb, target, None)
        return super(PubSub, self).poll(query, projection=projection,
                                        start_from_last=start_from_last,
                                        sleep_secs=sleep_secs,
                                        filter_func=self._yield_doc, limit=limit)

    def _tail_adhoc(self, *args, **kwargs):
        """bypass protocol and tail as defined by parent - used for testing"""
        return super(PubSub, self).tail(*args, **kwargs)

    def _poll_adhoc(self, *args, **kwargs):
        """bypass protocol and poll as defined by parent - used for testing"""
        return super(PubSub, self).poll(*args, **kwargs)

    @classmethod
    def msg_info(cls, msg):
        """returns dictionary with human readable info about a message"""
        res = DotDot(msg.copy())
        secs = DotDot()
        secs.receive = ((res.dt.received - res.dt.sent).total_seconds()
                        if res.status.state > MsgState.SENT else -1)
        secs.done = ((res.dt.completed - res.dt.received).total_seconds()
                     if res.status.state > MsgState.RECEIVED else -1)
        res['seconds'] = secs
        for k, v in list(msg['dt'].items()):
            res['dt'][k] = cls._dt_frmt_info.format(k, v)
        res['status']['receivedBy'] = MsgState.value_name(res.status.receivedBy.strip())
        res['status']['state'] = MsgState.value_name(res.status.state)
        res.ackn = Acknowledge.value_name(res.ackn)
        res['address']['target'] = SubTarget.value_name(res.address.target)
        return res
class PubSubStats(object):
    """aggregation-based statistics helpers for a PubSub collection;
    built Aggregation objects are cached by name
    """

    def __init__(self, collection):
        self.collection = collection
        self.cache = {}  # name -> cached Aggregation instance

    def _aggr(self):
        return Aggregation(self.collection)

    def job_status(self, name=None, match=None, fields_list=None):
        """message counts grouped by fields_list (defaults to topic/state)

        :Parameters:
            - name: cache key (defaults to 'job_status')
            - match: optional pymongo match filter applied first
            - fields_list: fields to group by
        """
        if fields_list is None:  # avoid mutable default argument
            fields_list = ['address.topic', 'status.state']
        if name is None:
            name = 'job_status'
        res = self.cache.get(name)
        if res is not None:
            return res
        aggr = self._aggr()
        if match is not None:
            aggr.match(match)
        aggr.group({'_id': aggr.construct_fields(fields_list), 'count': {'$sum': 1}})
        aggr.sort({'_id.status_state': 1})
        self.cache[name] = aggr
        return aggr

    def responce_stats(self, name=None, match=None, group=None):
        """stats on receive latency (dt.received - dt.sent) of acknowledged messages

        :Parameters:
            - name: cache key (defaults to 'responce_stats')
            - match: optional extra pymongo match filter (not mutated)
            - group: unused, kept for backward compatibility
        """
        if name is None:
            name = 'responce_stats'
        res = self.cache.get(name)
        if res is not None:
            return res
        aggr = Aggregation(self.collection)
        # BUG FIX: copy instead of mutating the caller's dict / shared default
        match = dict(match or {})
        match.update({'status.state': {'$gt': MsgState.SENT}})
        aggr.match(match)
        aggr.project({'_id': '$_id', 'state': '$status.state',
                      'rMillis': {'$subtract': ["$dt.received", "$dt.sent"]}})
        aggr.group(aggr.construct_stats(['rMillis']))
        self.cache[name] = aggr
        return aggr

    def mesgs_persec(self, query=None):
        """throughput estimate (messages/second) between first and last matching message

        :Returns: DotDot with msgs, seconds, msgsPerSec (-1 values if not computable)
        """
        if query is None:  # avoid mutable default argument
            query = {'status.state': {'$gt': MsgState.SENT}}
        res = DotDot({'msgs': -1, 'seconds': -1, 'msgsPerSec': -1})
        msg_first = self.collection.find_one(query, sort=[("$natural", 1)])
        if msg_first is not None:
            msgs = self.collection.find(query, sort=[("$natural", -1)])
            if msg_first['_id'] != msgs[0]['_id']:  # need at least two distinct messages
                res.msgs = msgs.count()
                res.seconds = (msgs[0]['dt']['received'] - msg_first['dt']['received']).total_seconds()
                res.msgsPerSec = 0 if res.seconds == 0 else res.msgs / res.seconds
        return res