# Source code for batchcreate.batchcreator

import sys


class BatchCreator:
    """
    Split a list of records into suitably sized batches.

    BatchCreator takes an array of records as input and splits it into
    batches of records which can be further processed or passed to any
    other system/s. Records keep their input order.

    All limit parameters are optional; if a parameter is not specified
    its default value is used.

    Example::

        from batchcreate import BatchCreator

        batches = BatchCreator(records, max_record_size=60,
                               max_batch_size=200, max_batch_num_records=4)

    The BatchCreator object is iterable and yields one batch (a list of
    records) per step, so it can be used in a regular 'for' loop.

    Example::

        for batch in batches:
            print(batch)  # batch processing here

    OR

    Example::

        batchItr = iter(batches)
        print(next(batchItr))  # batch processing here

    Sizes are measured with ``sys.getsizeof``, i.e. the size in bytes of
    each record object itself. ``sys.getsizeof`` is shallow: for container
    records (dict, list, ...) the objects they refer to are not counted.
    All limits are compared with a strict ``<``.

    Attributes:
        records : []
            Input list of records to split into batches.
        max_record_size : int, default 1000000 (1MB)
            The size limit for a record in the output batch. A record
            whose size is not below this limit is skipped from batching.
        max_batch_size : int, default 5000000 (5MB)
            The size limit for a batch, measured as the sum of the sizes
            of its records.
        max_batch_num_records : int, default 500
            The maximum number of records per batch. A batch holds at most
            this many records, provided the batch size limit is satisfied.
        batch : []
            The batch currently being filled during iteration.
        iter_records : iterator or None
            Iterator over ``records`` for the running iteration; ``None``
            when no iteration is in progress.

    Methods:
        batches : Returns the list of all the batches.
    """

    def __init__(
        self,
        records,
        max_record_size=1000000,
        max_batch_size=5000000,
        max_batch_num_records=500
    ):
        """
        Instantiates BatchCreator.

        If the arguments max_record_size, max_batch_size and
        max_batch_num_records are not passed then the default values are
        used.

        :param records (list): Input list of records to split into batches.
        :param max_record_size (int, optional): The size limit for a record
            if other than default.
        :param max_batch_size (int, optional): The size limit for a batch
            if other than default.
        :param max_batch_num_records (int, optional): The maximum number of
            records limit for a batch if other than default.
        """
        self.records = records
        self.max_record_size = max_record_size
        self.max_batch_size = max_batch_size
        self.max_batch_num_records = max_batch_num_records
        # Iteration state. Defined here so the helper methods and next()
        # never hit a missing attribute; iter() (re)starts the iteration.
        self.iter_records = None
        self.batch = []

    def _get_batch_size(self):
        """
        Gets the size of the current batch.

        The size is the sum of the sizes of the records in the batch.
        ``sys.getsizeof`` of the batch list itself is deliberately not
        used: it is shallow and reports only the list object, not the
        records it holds.

        :return: The size of the current batch.
        """
        return sum(self._get_record_size(record) for record in self.batch)

    def _get_batch_num_records(self):
        """
        Gets the number of records of the current batch.

        :return: The number of records of the current batch.
        """
        return len(self.batch)

    def _get_record_size(self, record):
        """
        Gets the size of a given record as reported by ``sys.getsizeof``.

        :param record: A record from input list of records.
        :return: The size of the record.
        """
        return sys.getsizeof(record)

    def _validate_batch_size(self, record):
        """
        Checks if the current batch size stays below the max_batch_size
        limit upon adding the new record.

        :param record: A record from input list of records.
        :return: Returns True if less than max_batch_size else False.
        """
        new_batch_size = self._get_batch_size() + self._get_record_size(record)
        return new_batch_size < self.max_batch_size

    def _validate_batch_num_records(self):
        """
        Checks if the number of records in the current batch is less than
        the max_batch_num_records limit, i.e. one more record may be added.

        :return: Returns True if less than max_batch_num_records else False.
        """
        return self._get_batch_num_records() < self.max_batch_num_records

    def _validate_record_size(self, record):
        """
        Checks if the size of the record is less than the max_record_size
        limit.

        :param record: A record from input list of records.
        :return: Returns True if less than max_record_size else False.
        """
        return self._get_record_size(record) < self.max_record_size

    def __iter__(self):
        """
        Starts (or restarts) the iteration over batches.

        Creates a fresh iterator over the input records and an empty batch
        to collect the valid records. The object is its own iterator, so
        calling iter() again resets any iteration already in progress.

        :return: BatchCreator iterator object (self).
        """
        self.iter_records = iter(self.records)
        self.batch = []
        return self

    def __next__(self):
        """
        Builds and returns the next batch.

        Records are consumed one by one. A record that is not of valid
        size is skipped. A valid record is added to the current batch if
        the batch has fewer records than max_batch_num_records and stays
        below max_batch_size with the record added; otherwise the current
        batch is returned and the record opens the next batch.

        A returned batch always contains at least one record. A record
        that passes the record size check but does not fit even into an
        empty batch is placed in a batch of its own.

        :return: Batch of record/s.
        :raises StopIteration: When all batches have been returned, or
            when iteration has not been started with iter().
        """
        if self.iter_records is None:
            raise StopIteration

        for record in self.iter_records:
            if not self._validate_record_size(record):
                continue  # Oversized record: skipped from batching.

            if self._validate_batch_num_records() and self._validate_batch_size(record):
                self.batch.append(record)
                continue

            # The record does not fit: it becomes the first record of the
            # next batch and the batch collected so far is complete.
            completed, self.batch = self.batch, [record]
            if completed:
                return completed
            # Nothing was collected yet (the record did not fit into an
            # empty batch); keep going instead of emitting an empty batch.

        # Input exhausted: hand out whatever is left, exactly once.
        self.iter_records = None
        if self.batch:
            completed, self.batch = self.batch, []
            return completed
        raise StopIteration

    def batches(self):
        """
        Creates a list of batches by iterating over BatchCreator iterator.

        Restarts the iteration from the first record.

        :return: List of batches.

        Example::

            batches = BatchCreator(records).batches()
        """
        return list(self)