"""some helper functions and classes"""

import logging
from datetime import datetime
from Hellas.Sparta import DotDot
from bson import json_util, SON
from mongoUtils import _PATH_TO_JS
from pymongo.read_preferences import ReadPreference
from pymongo.collection import Collection
from pymongo.database import Database
from pymongo import ReturnDocument
from pymongo.bulk import BulkOperationBuilder

LOG = logging.getLogger(__name__)
LOG.debug("loading module: " + __name__)


class MongoUtilsError(Exception):
    """Base class for all MongoUtils exceptions."""


class CollectionExists(MongoUtilsError):
    def __init__(self, collection_name=''):
        super(CollectionExists, self).__init__("Collection Exists " + collection_name)

"""operations on a collection object"""
def col_stats(collection_obj, indexDetails=True, scale=2 ** 10):
    """collection statistics; scale defaults to 2 ** 10 (KiloBytes),
    give it 2 ** 20 for MegaBytes or 2 ** 30 for GigaBytes
    """
    return DotDot(collection_obj.database.command(
        "collstats", collection_obj.name, indexDetails=indexDetails, scale=scale))

def coll_index_names(coll_obj):
    return [i['key'][0][0] for i in list(coll_obj.index_information().values())]

def coll_validate(coll_obj, scandata=False, full=False):
    """`see validate <http://docs.mongodb.org/manual/reference/command/validate/#dbcmd.validate>`_"""
    return coll_obj.database.validate_collection(coll_obj.name, scandata=scandata, full=full)

def coll_range(coll_obj, field_name="_id"):
    """returns (minimum, maximum) value of a field

    :Parameters:
        - coll_obj: a pymongo collection object
        - field_name: (str) name of field (defaults to _id)

    :Example:
        >>> coll_range(db.muTest_tweets_users, 'id_str')
        (u'1004509039', u'999314042')
    """
    projection = {} if field_name == '_id' else {'_id': 0, field_name: 1}  # make sure we get just ONE field
    idMin = coll_obj.find_one(sort=[(field_name, 1)], projection=projection)
    if idMin:
        idMin = list(idMin.values())[0]
        idMax = coll_obj.find_one(sort=[(field_name, -1)], projection=projection)
        idMax = list(idMax.values())[0]
        return idMin, idMax
    else:
        return None, None

def coll_chunks(collection, field_name="_id", chunk_size=100000):
    """Provides an iterator with range query arguments for scanning a collection
    in batches equal to chunk_size; for optimization reasons first chunk size is chunk_size + 1

    Similar to the undocumented mongoDB splitVector command, try it in the mongo console:
    ``db.runCommand({splitVector: "mongoUtilsTests.muTest_tweets", keyPattern: {_id: 1}, maxChunkSizeBytes: 1000000})``
    Our implementation seems faster than splitVector when tried on a collection of ~300 million documents.

    :Parameters:
        - collection: (obj) a pymongo collection instance
        - field_name: (str) the collection field to use, defaults to _id; this field must be
          indexed, otherwise the operation will be slow, and all documents must have
          a value for this field
        - chunk_size: (int or float) (defaults to 100000)
            - if int: requested number of documents in each chunk
            - if float (< 1.0): fraction of total documents in collection, i.e. 0.2 means 20%

    :Returns:
        - an iterator yielding a tuple (chunk number, query specification dictionary) for each chunk

    :Usage:
        >>> rt = coll_chunks(db.muTest_tweets, 'id_str', 400)
        >>> for i in rt: print(i)
        (0, {'id_str': {'$lte': u'523829721985851392', '$gte': u'523829696790663168'}})
        (1, {'id_str': {'$lte': u'523829751329611777', '$gt': u'523829721985851392'}})
        (2, {'id_str': {'$lte': u'523829763937681408', '$gt': u'523829751329611777'}})
    """
    projection = {} if field_name == '_id' else {'_id': 0, field_name: 1}  # make sure we get just ONE field
    if isinstance(chunk_size, float) and chunk_size < 1:
        chunk_size = int(collection.count() * chunk_size)
    idMin, idMax = coll_range(collection, field_name)
    curMin = idMin
    curMax = idMax
    cntChunk = 0
    while cntChunk == 0 or curMax < idMax:
        nextChunk = collection.find_one({field_name: {"$gte": curMin}}, sort=[(field_name, 1)],
                                        skip=chunk_size, projection=projection)
        curMax = list(nextChunk.values())[0] if nextChunk else idMax
        query = {field_name: {"$gte" if cntChunk == 0 else "$gt": curMin, "$lte": curMax}}
        yield cntChunk, query
        cntChunk += 1
        curMin = curMax

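# Illustrative sketch (not part of the original module): process a collection in
# batches using coll_chunks; 'muTest_tweets' and 'id_str' are the docstring's example
# names and are assumed to exist, with 'id_str' indexed.
def _example_scan_in_chunks(db):
    for chunk_no, query in coll_chunks(db.muTest_tweets, field_name='id_str', chunk_size=10000):
        for doc in db.muTest_tweets.find(query):
            pass  # process each document of the current chunk here
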
def coll_update_id(coll_obj, doc, new_id):
    """updates a document's _id by inserting a new doc and then removing the old one

    .. Warning::
        | Very dangerous if you don't know what you are doing, use it at your own risk.
        | Never use it in production.
        | Also be careful with shallow copies.

    :Parameters:
        - coll_obj: a pymongo collection
        - doc: document to rewrite with a new_id
        - new_id: value of new id

    :Returns: a tuple
        - tuple[0]: True if operation was successful otherwise False
        - tuple[1]: delete results if successful, the Exception class if insert raised
        - tuple[2]: insert results if successful, the exception instance if insert raised
    """
    docNew = doc.copy()
    docNew['_id'] = new_id
    try:
        rt = coll_obj.insert_one(docNew)
    except Exception as e:
        return False, Exception, e  # @note: on error return before removing original !!!!
    else:
        if rt.inserted_id == new_id:
            return True, coll_obj.find_one_and_delete({"_id": doc['_id']}), rt
        else:
            return False, False, False

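# Illustrative sketch (not part of the original module): rewrite a document's _id and
# check the returned status tuple; 'a_collection' and the id values are hypothetical.
def _example_coll_update_id(a_collection):
    doc = a_collection.find_one({'_id': 'old_id'})
    ok, delete_res, insert_res = coll_update_id(a_collection, doc, 'new_id')
    if not ok:
        LOG.error("could not rewrite _id, original document left untouched")
    return ok
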
def coll_copy(collObjFrom, collObjTarget, filter_dict={}, create_indexes=False, dropTarget=False,
              write_options={}, verbose=10):
    """copies a collection using unordered bulk inserts, similar to
    `copyTo <http://docs.mongodb.org/manual/reference/method/db.collection.copyTo/>`_
    which is now deprecated

    :Parameters:
        - collObjFrom: source collection
        - collObjTarget: destination collection
        - filter_dict: a pymongo query dictionary to specify which documents to copy (defaults to {})
        - create_indexes: creates same indexes on destination collection if True
        - dropTarget: drop target collection before copy if True (otherwise appends to it)
        - write_options: operation options (use {'w': 0} for non-critical copies)
        - verbose: if > 0 prints progress statistics at verbose percent intervals
    """
    frmt_stats = "copying {:6.2f}% done documents={:20,d} of {:20,d}"
    if verbose > 0:
        print("copy_collection:{} to {}".format(collObjFrom.name, collObjTarget.name))
    if dropTarget:
        collObjTarget.drop()
    docs = collObjFrom.find(filter_dict)
    totalRecords = collObjFrom.count() if filter_dict == {} else docs.count()
    if verbose > 0:
        print("totalRecords", totalRecords)
    perc_done_last = -1
    bulk = muBulkOps(collObjTarget, ordered=False, ae_n=1000, dwc={'w': "majority"})
    cnt = 0
    for doc in docs:
        cnt += 1
        if verbose > 0:
            perc_done = round((cnt + 1.0) / totalRecords, 3) * 100
            if perc_done != perc_done_last and perc_done % verbose == 0:
                print(frmt_stats.format(perc_done, cnt, totalRecords))
                perc_done_last = perc_done
        bulk.insert(doc)
    bulk.execute_if_pending()
    if create_indexes:
        for k, v in list(collObjFrom.index_information().items()):
            if k != "_id_":
                idx = v['key'][0]
                if verbose > 0:
                    print("creating index {:s}".format(k))
                collObjTarget.create_index([idx], background=True)
    return collObjTarget

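# Illustrative sketch (not part of the original module): copy only matching documents
# to a fresh collection, recreating indexes; the collection names and the 'created_at'
# field are hypothetical.
def _example_coll_copy(db):
    return coll_copy(db['tweets'], db['tweets_2014'],
                     filter_dict={'created_at': {'$gte': datetime(2014, 1, 1)}},
                     create_indexes=True, dropTarget=True, verbose=10)
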
def db_capped_create(db, coll_name, sizeBytes=10000000, maxDocs=None, autoIndexId=True):
    """create a capped collection

    :Parameters:
        - `see here <http://api.mongodb.org/python/current/api/pymongo/database.html>`_
        - `and here <http://docs.mongodb.org/manual/reference/method/db.createCollection/>`_
    """
    if coll_name not in db.collection_names():
        return db.create_collection(coll_name, capped=True, size=sizeBytes, max=maxDocs,
                                    autoIndexId=autoIndexId)
    else:
        raise CollectionExists(coll_name)


def db_convert_to_capped(db, coll_name, sizeBytes=2 ** 30):
    """converts a collection to capped"""
    if coll_name in db.collection_names():
        return db.command({"convertToCapped": coll_name, 'size': sizeBytes})

def db_capped_set_or_get(db, coll_name, sizeBytes=2 ** 30, maxDocs=None, autoIndexId=True):
    """sets or converts a collection to capped
    `see more here <http://docs.mongodb.org/manual/tutorial/use-capped-collections-for-fast-writes-and-reads>`_

    autoIndexId is required for replication, so it must be True except on a stand-alone
    mongodb or when the collection belongs to the local db
    """
    if coll_name not in db.collection_names():
        return db.create_collection(coll_name, capped=True, size=sizeBytes, max=maxDocs,
                                    autoIndexId=autoIndexId)
    else:
        capped_coll = db[coll_name]
        if not capped_coll.options().get('capped'):
            db.command({"convertToCapped": coll_name, 'size': sizeBytes})
        return capped_coll

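# Illustrative sketch (not part of the original module): get (or create) a 1 GB capped
# collection suitable for logging; 'log' is a hypothetical collection name.
def _example_capped(db):
    log_coll = db_capped_set_or_get(db, 'log', sizeBytes=2 ** 30)
    assert log_coll.options().get('capped')
    return log_coll
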
def client_schema(client, details=1, verbose=True):
    """returns and optionally prints a mongo schema containing databases and collections in use

    :Parameters:
        - client: a pymongo client instance
        - details: (int) level of details to print/return
        - verbose: (bool) if True prints results
    """
    def col_details(col):
        res = col.name if details == 0 else {'name': col.name}
        if details > 1:
            res['stats'] = col_stats(col)
        return res
    rt = [[d, [col_details(client[d][c]) for c in client[d].collection_names()]]
          for d in client.database_names()]
    rt = dict(rt)
    if verbose:
        pp_doc(rt)
    return rt

class muBulkOps(object):
    """a wrapper around BulkOperationBuilder that provides for some automation

    .. versionadded:: 1.0.6

    :Parameters:
        - ae_n: (int) auto execute every n operations (defaults to 0 to refrain from auto execution)
        - ae_s: (int) auto execute when more than ae_s seconds have passed since start or
          last execute, useful when we want to ensure that collection data are relatively
          fresh; set it to 0 (the default) to disable time-based auto execution
        - dwc: (dict or None) default write concern to use in case of auto execute;
          DO NOT pass a WriteConcern object, just a plain dict i.e. {'w': 1}
    """
    frmt_stats = "{:s}db:{:s} collection:{:s} cnt_operations_executed:{:16,d} cnt_operations_pending:{:6,d}"

    def __init__(self, collection, ordered=True, ae_n=0, ae_s=0, dwc=None):
        """Initialize a new BulkOperationBuilder instance."""
        self.collection = collection
        self.ordered = ordered
        self.ae_n = ae_n
        self.ae_s = ae_s
        self.dwc = dwc
        self.cnt_operations_pending = 0
        self.cnt_operations_executed = 0
        if ae_s != 0:
            self.dt_last = datetime.now()
        self._init_builder()

    def _init_builder(self):
        self._bob = BulkOperationBuilder(collection=self.collection, ordered=self.ordered)

    def find(self, selector):
        return self._bob.find(selector)

    # def append(self, document):
    #     return self.insert(document)

    def stats(self, message=''):
        return self.frmt_stats.format(message, self.collection.database.name, self.collection.name,
                                      self.cnt_operations_executed, self.cnt_operations_pending)

    def stats_print(self):
        print(self.stats())

    def insert(self, document):
        rt = self._bob.insert(document)
        self.cnt_operations_pending += 1
        # LOG.critical(self.stats("inserts"))
        if self.ae_s != 0:
            current_dt = datetime.now()
            if self.cnt_operations_pending == self.ae_n or (current_dt - self.dt_last).seconds > self.ae_s:
                self.dt_last = current_dt
                rt = self.execute(write_concern=self.dwc, recreate=True)
        elif self.cnt_operations_pending == self.ae_n:
            rt = self.execute(write_concern=self.dwc, recreate=True)
        return rt

    def execute(self, write_concern=None, recreate=True):
        rt = self._bob.execute(write_concern=write_concern)
        self.cnt_operations_executed += self.cnt_operations_pending
        self.cnt_operations_pending = 0
        if recreate:
            self._init_builder()
        return rt

    def execute_if_pending(self, write_concern=None):
        """executes any operations still pending, call it on error or when a batch is finished"""
        if write_concern is None:
            write_concern = self.dwc
        if self.cnt_operations_pending > 0:
            return self.execute(write_concern=write_concern, recreate=True)

    # Collection.parallel_scan(self, num_cursors)

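# Illustrative sketch (not part of the original module): bulk-insert documents with
# auto-execute every 1000 operations or every 5 seconds, whichever comes first;
# 'a_collection' and 'documents' are hypothetical.
def _example_muBulkOps(a_collection, documents):
    bulk = muBulkOps(a_collection, ordered=False, ae_n=1000, ae_s=5, dwc={'w': 1})
    for doc in documents:
        bulk.insert(doc)
    bulk.execute_if_pending()  # flush whatever is left after the loop
    print(bulk.stats("done "))
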
class muCollection(Collection):
    """just a plain pymongo collection with some extra features

    it is safe to cast an existing pymongo collection to this by:

    >>> a_pymongo_collection_instance.__class__ = muCollection
    """

    def stats(self, indexDetails=True, scale=2 ** 10):
        """collection statistics (see :func:`col_stats`)"""
        return col_stats(self, indexDetails, scale)

    def index_names(self):
        """see :func:`coll_index_names`"""
        return coll_index_names(self)

    def validate(self, scandata=False, full=False):
        """see :func:`coll_validate`"""
        return coll_validate(self, scandata=scandata, full=full)

class muDatabase(Database):
    """just a plain pymongo Database with some extra features

    it is safe to cast an existing pymongo database to this by:

    >>> a_pymongo_database_instance.__class__ = muDatabase
    """

    def dbstats(self, scale=2 ** 30):
        """:Returns: database statistics"""
        return DotDot(self.command("dbStats", scale=scale))

    def collstats(self, details=2, verbose=True):
        """:Returns: database collections statistics"""
        def coll_details(col):
            res = col.name if details == 0 else {'namespace': self.name + '.' + col.name}
            if details > 1:
                res['stats'] = col.stats()
            return res
        rt = DotDot([[c, coll_details(self[c])] for c in self.collection_names()])
        pp_doc(rt, sort_keys=True, verbose=verbose)
        return rt

    def server_status(self, verbose=True):
        """:Returns: server status"""
        rt = DotDot(self.command("serverStatus"))
        pp_doc(rt, sort_keys=True, verbose=verbose)
        return rt

    def capped_create(self, colName, sizeBytes=10000000, maxDocs=None, autoIndexId=True):
        return db_capped_create(self, colName, sizeBytes, maxDocs, autoIndexId)

    def convert_to_capped(self, colName, sizeBytes=2 ** 30):
        return db_convert_to_capped(self, colName, sizeBytes)

    def capped_set_or_get(self, colName, sizeBytes=2 ** 30, maxDocs=None, autoIndexId=True):
        """see :func:`db_capped_set_or_get`"""
        return db_capped_set_or_get(self, colName, sizeBytes, maxDocs, autoIndexId)

    def colections_startingwith(self, startingwith=[]):
        """returns names of all collections starting with specified strings

        :Parameters:
            - startingwith: (list) of prefixes i.e. ["tmp\\_", "del\\_"]
        """
        return [c for c in self.collection_names() if any([c.startswith(i) for i in startingwith])]

    def drop_collections_startingwith(self, startingwith=[]):
        """drops all collections with names starting with specified strings

        :Parameters:
            - startingwith: (list) of prefixes i.e. ["tmp\\_", "del\\_"]
        """
        colsToRemove = self.colections_startingwith(startingwith)
        for c in colsToRemove:
            self.drop_collection(c)
        return colsToRemove

    def js_fun_add(self, fun_name, fun_str):
        """adds a js function to the database

        to use the function from the mongo shell you have to execute db.loadServerScripts(); first

        :Parameters:
            - fun_name (string): a name for this function
            - fun_str (string): js function string
        """
        self.system_js[fun_name] = fun_str
        return fun_name

    def js_fun_add_default(self, file_name, fun_name):
        return self.js_fun_add(fun_name, parse_js_default(file_name, fun_name))

    def js_list(self):
        """:Returns: all user js functions installed on server"""
        return self.system_js.list()

    def __getitem__(self, name):
        return muCollection(self, name)

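# Illustrative sketch (not part of the original module): cast a plain pymongo Database
# to muDatabase and use its extras; client and collection names are hypothetical.
def _example_muDatabase(client):
    db = client['mongoUtilsTests']
    db.__class__ = muDatabase           # safe cast, as noted in the class docstring
    db.drop_collections_startingwith(['tmp_', 'del_'])
    return db['muTest_tweets'].stats()  # __getitem__ returns a muCollection
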
def pp_doc(doc, indent=4, sort_keys=False, verbose=True):
    """pretty print a bson document"""
    rt = json_util.dumps(doc, sort_keys=sort_keys, indent=indent, separators=(',', ': '))
    if verbose:
        print(rt)
    return rt

class AuxTools(object):
    """**a collection to support generation of sequence numbers using the counters collection technique**

    .. Note::
        a counters collection guarantees a unique incremental id value even in a
        multiprocess/multithreading environment, but if this id is used for insertions,
        insertion order is not 100% guaranteed to correspond to this id;
        if insertion order is critical use the Optimistic Loop technique

    .. Seealso::
        `counters collection <http://docs.mongodb.org/manual/tutorial/create-an-auto-incrementing-field/#auto-increment-counters-collection>`__
        and `Copeland's snippet <https://github.com/rick446/MongoTools/blob/master/mongotools/pubsub/channel.py>`__

    :Parameters:
        - collection: (obj optional) a pymongo collection object
        - db: (obj optional) a pymongo database object
        - client: (obj optional) a pymongo MongoClient instance
        - all parameters are optional but exactly one must be provided
        - if collection is None a collection db['AuxCol'] will be used
        - if db is None a collection client['AuxTools']['AuxCol'] will be used
    """

    def __init__(self, collection=None, db=None, client=None):
        if collection is None:
            collection = db['AuxCol'] if db is not None else client['AuxTools']['AuxCol']
        self.collection = collection.with_options(read_preference=ReadPreference.PRIMARY)
        # @note: make sure sequence_current gets correct value

    def sequence_reset(self, seq_name):
        """resets sequence"""
        self.collection.remove({'_id': seq_name})

    def sequence_set(self, seq_name, val=1):
        """sets sequence's current value to val, if it doesn't exist it is created"""
        return self.collection.find_one_and_update({'_id': seq_name}, {'$set': {'val': val}},
                                                   upsert=True, return_document=ReturnDocument.AFTER)['val']

    def sequence_current(self, seq_name):
        """returns sequence's current value for a particular name"""
        doc = self.collection.find_one({'_id': seq_name})
        return 0 if doc is None else doc['val']

    def sequence_next(self, seq_name, inc=1):
        """increments sequence's current value by inc, if it doesn't exist sets initial value to inc"""
        return self.collection.find_one_and_update({'_id': seq_name}, {'$inc': {'val': inc}},
                                                   upsert=True, return_document=ReturnDocument.AFTER)['val']

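# Illustrative sketch (not part of the original module): generate unique incremental
# ids across processes; the 'foo' sequence name is hypothetical.
def _example_aux_tools(client):
    aux = AuxTools(client=client)        # uses client['AuxTools']['AuxCol']
    first = aux.sequence_next('foo')     # 1 on first call (created by upsert)
    second = aux.sequence_next('foo')    # 2, even if incremented from another process
    return first, second, aux.sequence_current('foo')
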
class SONDot(SON):
    """A SON class that can handle dot notation to access its members
    (useful when parsing JSON content)

    :Example:
        >>> son = SONDot([('foo', 'bar'), ('son2', SON([('son2foo', 'son2Bar')]))])
        >>> son.son2.son2foo
        son2Bar

    .. Warning::
        don't use dot notation for write operations i.e. son.foo = 'bar' **(it will fail silently!)**
    """
    def __getattr__(self, attr):
        try:
            item = self[attr]
        except KeyError as e:
            raise AttributeError(e)  # expected Error by pickle on __getstate__ etc.
        if isinstance(item, dict) and not isinstance(item, DotDot):
            item = SONDot(item)
        return item

def parse_js(file_path, function_name, replace_vars=None):
    """
    | helper function to get a js function string from a file containing js functions
    | useful if we want to call js functions from python as in mongoDB map-reduce.
    | The function declaration must start in the first column and end with '}' in the
      first column (see relevant functions in the js directory)

    :Parameters:
        - file_path: (str) full path name
        - function_name: (str) name of function
        - replace_vars: (optional) a tuple to replace %s variables in function

    :Returns: a js function as string
    """
    rt = ''
    start = 'function {}'.format(function_name)
    with open(file_path, 'r') as fin:
        ln = fin.readline()
        while ln:
            if ln.startswith(start) or rt:
                rt += ln
            if rt and ln.startswith('}'):
                break
            ln = fin.readline()
    if rt and replace_vars:
        return rt % replace_vars
    else:
        return rt

def parse_js_default(file_name, function_name, replace_vars=None):
    """fetch a js function from file_name in the default js directory (see :func:`parse_js`)"""
    return parse_js(_PATH_TO_JS + file_name, function_name, replace_vars)

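# Illustrative sketch (not part of the original module): load a js function from the
# package's js directory and register it on the server; 'groupDates.js' and 'mapDates'
# are hypothetical names, and mu_db is assumed to be a muDatabase instance.
def _example_js(mu_db):
    fun_str = parse_js_default('groupDates.js', 'mapDates')
    return mu_db.js_fun_add('mapDates', fun_str)
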
def geo_near_point_q(geo_field, Long_Lat, query={}, minDistance=None, maxDistance=None):
    """geo near point query constructor

    :Parameters:
        - geo_field: (str) name of geo indexed field (i.e. location)
        - Long_Lat: (tuple or list) [longitude, latitude]
        - query: (dict) any other query specifications to be combined with the geo query (defaults to {})
        - minDistance: minimum distance in meters (defaults to None)
        - maxDistance: maximum distance in meters (defaults to None)

    :Returns: query dictionary updated with geo specs
    """
    gq = {geo_field: {'$near': {'$geometry': {'type': 'Point', 'coordinates': Long_Lat}}}}
    if minDistance is not None:
        gq[geo_field]['$near']['$minDistance'] = minDistance
    if maxDistance is not None:
        gq[geo_field]['$near']['$maxDistance'] = maxDistance
    query.update(gq)
    return query

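# Illustrative sketch (not part of the original module): find documents within 1 km of
# a point, combined with an extra filter; the 'places' collection, 'active' field and
# 'location' field (assumed to have a 2dsphere index) are hypothetical.
def _example_geo(db):
    q = geo_near_point_q('location', [23.72, 37.97], query={'active': True}, maxDistance=1000)
    return list(db.places.find(q))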