Source code for mongoUtils.schema

"""**Schema Analyzer Utilities for mongoDB collection based on map reduce**"""


from mongoUtils.helpers import parse_js_default
from mongoUtils.mapreduce import mr
from Hellas.Thebes import format_header


def schema(collection, query=None, out=None, meta=False, scope=None, verbose=2):
    """Discover all field names used by a collection's documents (map reduce based).

    For a different approach `see here <https://github.com/variety/variety>`_;
    mongoDB may also introduce a similar tool.

    Fields of the form ``xxx.floatApprox``, ``xxx.bottom``, ``xxx.top`` are internal
    mongoDB fields used for storing long integers.

    By default the results replace the ``tmp_mrFields`` collection. To keep the results
    from being replicated, direct the output to the local db instead, e.g.
    ``out={'db': 'local', 'replace': output_coll_name}``.

    After the map reduce finishes, every result document gets a ``value.percent``
    entry (``value.cnt`` as a percentage of the number of input documents) and an
    index on ``_id.type`` is created on the results collection.

    :Parameters:
        - collection: a mongoDB collection
        - query: a pymongo query dictionary to restrict the documents searched to a
          subset of the collection (useful for large collections);
          defaults to ``{}`` (all documents)
        - out: map reduce output specifications dictionary (see :func:`~.mr`);
          it can't be inline. Defaults to ``{'replace': 'tmp_mrFields'}``
        - meta: if True results are passed to :func:`schema_meta` for analysis
        - scope: a dictionary, defaults to
          ``{'parms': {'levelMax': -1, 'inclHeaderKeys': False}}``

            - levelMax: (int) max level for keys, if -1 any level (defaults to -1)
            - inclHeaderKeys: (bool) if True includes top level keys
        - verbose: (int) if > 0 prints progress and output

    :Returns:
        - the value returned by :func:`~.mr` (used here as a
          ``(results collection, map reduce stats)`` pair) when meta is False
        - a tuple ``(mr results, schema_meta results)`` when meta is True

    :Example:
        >>> from pymongo import MongoClient
        >>> from mongoUtils.configuration import testDbConStr
        >>> db = MongoClient(testDbConStr).get_default_database()  # get test database
        >>> r = schema(db.muTest_tweets, meta=True, verbose=1)     # check fields
        ..........................................................................
        |          field           |  cnt  |percent|depth|       notes        |
        ..........................................................................
        | _id                      |  1,000| 100.00|    1|                    |
        | contributors             |  1,000| 100.00|    1|                    |
        | coordinates              |  1,000| 100.00|    1|                    |
        | coordinates.coordinates  |     18|   1.80|    2|                    |
        | coordinates.type         |     18|   1.80|    2|                    |
        | created_at               |  1,000| 100.00|    1|                    |
        | entities                 |  1,000| 100.00|    1|                    |
        | entities.hashtags        |  1,000| 100.00|    2|                    |
        | entities.media           |    196|  19.60|    2|                    |
        | entities.symbols         |  1,000| 100.00|    2|                    |
        | entities.trends          |  1,000| 100.00|    2|                    |
        | entities.urls            |  1,000| 100.00|    2|                    |
        | entities.user_mentions   |  1,000| 100.00|    2|                    |
        | extended_entities        |    196|  19.60|    1|                    |
        | extended_entities.media  |    196|  19.60|    2|                    |
        | favorite_count           |  1,000| 100.00|    1|                    |
        | favorited                |  1,000| 100.00|    1|                    |
        | filter_level             |  1,000| 100.00|    1|                    |
        | geo                      |  1,000| 100.00|    1|                    |
        | geo.coordinates          |     18|   1.80|    2|                    |
        | geo.type                 |     18|   1.80|    2|                    |
        | id                       |  1,000| 100.00|    1|                    |
        | id.bottom                |  1,000| 100.00|    2|-hidden mongo field |
        | id.floatApprox           |  1,000| 100.00|    2|-hidden mongo field |
        | ......etc.......         |  .....| ......|    .|                    |
        ..........................................................................
        >>> for i in r[1]: print(i)                                # print results
        {u'_id': u'', u'value': {'notes': '', 'field': u'', u'cnt': 31,
         u'percent': 3.1000000000000005, u'depth': 1}}
        etc. etc...
    """
    # Build the defaults per call: the dictionaries are handed to mr(), so shared
    # module-level default objects could leak state from one call to the next.
    if query is None:
        query = {}
    if out is None:
        out = {'replace': 'tmp_mrFields'}
    if scope is None:
        scope = {'parms': {'levelMax': -1, 'inclHeaderKeys': False}}

    map_js = parse_js_default('MapReduce.js', 'KeysMap')
    reduce_js = parse_js_default('MapReduce.js', 'KeysReduce')
    if verbose > 0:
        print("discovering fields ...")
    rt = mr(collection, map_js, reduce_js, query=query, out=out, scope=scope, verbose=0)
    results_col, mr_stats = rt
    # float() keeps the percentage a true division under Python 2 as well.
    total_records = float(mr_stats['counts']['input'])
    if verbose > 0:
        print("calculating percentages ...")
    for doc in results_col.find():
        percent = (doc['value']['cnt'] / total_records) * 100
        # The filter reuses the _id exactly as it was read. Original author's warning:
        # "don't use {_id:id} does not work possibly coz different subfields order".
        results_col.update_one({'_id': doc['_id']}, {"$set": {"value.percent": percent}})
    if verbose > 0:
        print("creating indexes")
    results_col.create_index('_id.type', background=True)
    if meta:
        return rt, schema_meta(rt, verbose=verbose)
    return rt
def schema_meta(mr_keys_results, verbose=2):
    """Calculate per-field statistics from the results of the :func:`schema` function.

    Runs a second (inline) map reduce over the results collection, then decorates every
    returned row's ``value`` with integer ``cnt`` and ``depth``, a ``field`` entry (the
    row ``_id``) and a ``notes`` entry. Also pretty prints the stats if verbose > 0.

    Be aware of hidden mongoDB fields that mongoDB uses internally: rows deeper than
    level 1 whose last name component is ``floatApprox``, ``top`` or ``bottom``, or
    whose ``_id`` is ``_id.str``, get the note ``-hidden mongo field``.

    :Parameters:
        - mr_keys_results: results tuple returned by :func:`schema`; only its first
          element (the results collection) is used
        - verbose: (int) if > 0 prints a table of the fields followed by a summary
          dictionary (``max depth``, ``total fields``)

    :Returns: list of statistics rows, one per field, sorted by ``_id``
              (an empty list when the map reduce produced no rows)
    """
    map_js = parse_js_default('MapReduce.js', 'KeysMetaMap')
    reduce_js = parse_js_default('MapReduce.js', 'KeysMetaReduce')
    hidden_fields = ('floatApprox', 'top', 'bottom')  # internal mongo fields
    res = mr(mr_keys_results[0], map_js, reduce_js, out={'inline': 1}, verbose=0)
    rows = list(res[0])
    for row in rows:
        value = row['value']
        value['cnt'] = int(value['cnt'])
        value['depth'] = int(value['depth'])
        value['field'] = row['_id']
        is_hidden = value['depth'] > 1 and (
            row['_id'].split('.')[-1] in hidden_fields or row['_id'] == '_id.str')
        value['notes'] = "-hidden mongo field" if is_hidden else ""

    depths = [row['value']['depth'] for row in rows]
    info = {
        # An empty result set (e.g. empty collection) has no depth; report 0
        # instead of failing on an empty sequence.
        'max depth': max(depths) if depths else 0,
        'total fields': len(rows),
    }
    by_field = sorted(rows, key=lambda row: row['_id'])
    if verbose > 0:
        frmt = "|{field:^70s}|{cnt:16,d}|{percent:7.2f}|{depth:5d}|{notes:^20s}|"
        header = format_header(frmt)
        print(header)
        for row in by_field:
            print(frmt.format(**row['value']))
        # Re-print the header's first line to close the table.
        print(header.split('\n')[0])
        print(info)
    return by_field
def schema_exclude_parents(fields_list, as_string=True):
    """Drop parent fields, keeping only the last level elements of fields_list.

    A field is a parent when some other entry of fields_list starts with that field's
    name followed by a dot (``'_id'`` is a parent of ``'_id.str'``). Useful for
    producing the fields parameter for mongoexport.

    :Parameters:
        - fields_list: an iterable of dotted field names (strings)
        - as_string: if exactly ``True`` (default) the result is joined with commas
          into a single string, otherwise a list is returned

    :Returns:
        - last level elements of fields_list, in their original order

    :Example:
        >>> schema_exclude_parents(['_id', '_id.str', 'created_at'])
        '_id.str,created_at'
        >>> schema_exclude_parents(['_id', '_id.str', 'created_at'], as_string=False)
        ['_id.str', 'created_at']
    """
    fields = list(fields_list)
    # Collect every dotted prefix once (for 'a.b.c': 'a' and 'a.b'); a name is a parent
    # exactly when it equals one of these prefixes. This replaces rescanning the whole
    # list for every item.
    parents = set()
    for name in fields:
        dot = name.find('.')
        while dot != -1:
            parents.add(name[:dot])
            dot = name.find('.', dot + 1)
    leaves = [name for name in fields if name not in parents]
    # Identity check kept on purpose: only the literal True selects the string form.
    return ",".join(leaves) if as_string is True else leaves
def mongoexport_fields(file_path, collection, query=None, excl_fields_lst=None):
    """Export all field names except those in excl_fields_lst to a file.

    Field names are discovered with :func:`schema`, parent fields are removed with
    :func:`schema_exclude_parents`, names listed in excl_fields_lst are dropped, and
    the remaining names are written to file_path one per line.

    :Parameters:
        - file_path: (str) path to output file (overwritten if it exists)
        - collection: a pymongo collection object
        - query: a pymongo query dictionary (optional) to restrict fields discovery
          to a subset of a collection (useful for large collections)
        - excl_fields_lst: (list) field names to exclude from output; names are
          matched exactly against the last level field names

    :Returns:
        - the list of field names written to the file

    :Example:
        >>> mongoexport_fields("/path_to_file", db.muTest_tweets_users, excl_fields_lst=['_id'])
    """
    if query is None:
        query = {}
    excluded = set(excl_fields_lst) if excl_fields_lst else set()
    results = schema(collection, query=query, verbose=0)
    # NOTE(review): this reads a 'fields' list from the first result document, while
    # schema() itself only touches value.cnt / value.percent - confirm the map reduce
    # output really carries value.fields.
    all_fields = results[0].find()[0]['value']['fields']
    # Exclusions are applied after parent removal so that dropping a leaf does not
    # turn its parent into an exported field.
    fields = [name for name in schema_exclude_parents(all_fields, as_string=False)
              if name not in excluded]
    with open(file_path, "w") as fout:
        for name in fields:
            fout.write(name + "\n")
    return fields