diff options
Diffstat (limited to 'activerecord/lib/active_record/relation/batches.rb')
-rw-r--r-- | activerecord/lib/active_record/relation/batches.rb | 145 |
1 files changed, 97 insertions, 48 deletions
diff --git a/activerecord/lib/active_record/relation/batches.rb b/activerecord/lib/active_record/relation/batches.rb index de005e2810..4b2987ac6d 100644 --- a/activerecord/lib/active_record/relation/batches.rb +++ b/activerecord/lib/active_record/relation/batches.rb @@ -2,6 +2,8 @@ require "active_record/relation/batches/batch_enumerator" module ActiveRecord module Batches + ORDER_IGNORE_MESSAGE = "Scoped order is ignored, it's forced to be batch order." + # Looping through a collection of records from the database # (using the Scoping::Named::ClassMethods.all method, for example) # is very inefficient since it will try to instantiate all the objects at once. @@ -31,13 +33,20 @@ module ActiveRecord # * <tt>:batch_size</tt> - Specifies the size of the batch. Default to 1000. # * <tt>:start</tt> - Specifies the primary key value to start from, inclusive of the value. # * <tt>:finish</tt> - Specifies the primary key value to end at, inclusive of the value. - # This is especially useful if you want multiple workers dealing with - # the same processing queue. You can make worker 1 handle all the records - # between id 0 and 10,000 and worker 2 handle from 10,000 and beyond - # (by setting the +:start+ and +:finish+ option on each worker). + # * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when + # an order is present in the relation. + # + # Limits are honored, and if present there is no requirement for the batch + # size, it can be less than, equal, or greater than the limit. + # + # The options +start+ and +finish+ are especially useful if you want + # multiple workers dealing with the same processing queue. You can make + # worker 1 handle all the records between id 1 and 9999 and worker 2 + # handle from 10000 and beyond by setting the +:start+ and +:finish+ + # option on each worker. # - # # Let's process for a batch of 2000 records, skipping the first 2000 rows - # Person.find_each(start: 2000, batch_size: 2000) do |person| + # # Let's process from record 10_000 on. + # Person.find_each(start: 10_000) do |person| # person.party_all_night! # end # @@ -46,15 +55,15 @@ module ActiveRecord # work. This also means that this method only works when the primary key is # orderable (e.g. an integer or string). # - # NOTE: You can't set the limit either, that's used to control - # the batch sizes. - def find_each(start: nil, finish: nil, batch_size: 1000) + # NOTE: By its nature, batch processing is subject to race conditions if + # other processes are modifying the database. + def find_each(start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil) if block_given? - find_in_batches(start: start, finish: finish, batch_size: batch_size) do |records| + find_in_batches(start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore) do |records| records.each { |record| yield record } end else - enum_for(:find_each, start: start, finish: finish, batch_size: batch_size) do + enum_for(:find_each, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore) do relation = self apply_limits(relation, start, finish).size end @@ -83,13 +92,20 @@ module ActiveRecord # * <tt>:batch_size</tt> - Specifies the size of the batch. Default to 1000. # * <tt>:start</tt> - Specifies the primary key value to start from, inclusive of the value. # * <tt>:finish</tt> - Specifies the primary key value to end at, inclusive of the value. - # This is especially useful if you want multiple workers dealing with - # the same processing queue. You can make worker 1 handle all the records - # between id 0 and 10,000 and worker 2 handle from 10,000 and beyond - # (by setting the +:start+ and +:finish+ option on each worker). + # * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when + # an order is present in the relation. + # + # Limits are honored, and if present there is no requirement for the batch + # size, it can be less than, equal, or greater than the limit. + # + # The options +start+ and +finish+ are especially useful if you want + # multiple workers dealing with the same processing queue. You can make + # worker 1 handle all the records between id 1 and 9999 and worker 2 + # handle from 10000 and beyond by setting the +:start+ and +:finish+ + # option on each worker. # - # # Let's process the next 2000 records - # Person.find_in_batches(start: 2000, batch_size: 2000) do |group| + # # Let's process from record 10_000 on. + # Person.find_in_batches(start: 10_000) do |group| # group.each { |person| person.party_all_night! } # end # @@ -98,18 +114,18 @@ module ActiveRecord # work. This also means that this method only works when the primary key is # orderable (e.g. an integer or string). # - # NOTE: You can't set the limit either, that's used to control - # the batch sizes. - def find_in_batches(start: nil, finish: nil, batch_size: 1000) + # NOTE: By its nature, batch processing is subject to race conditions if + # other processes are modifying the database. + def find_in_batches(start: nil, finish: nil, batch_size: 1000, error_on_ignore: nil) relation = self unless block_given? - return to_enum(:find_in_batches, start: start, finish: finish, batch_size: batch_size) do + return to_enum(:find_in_batches, start: start, finish: finish, batch_size: batch_size, error_on_ignore: error_on_ignore) do total = apply_limits(relation, start, finish).size (total - 1).div(batch_size) + 1 end end - in_batches(of: batch_size, start: start, finish: finish, load: true) do |batch| + in_batches(of: batch_size, start: start, finish: finish, load: true, error_on_ignore: error_on_ignore) do |batch| yield batch.to_a end end @@ -140,16 +156,20 @@ module ActiveRecord # * <tt>:load</tt> - Specifies if the relation should be loaded. Default to false. # * <tt>:start</tt> - Specifies the primary key value to start from, inclusive of the value. # * <tt>:finish</tt> - Specifies the primary key value to end at, inclusive of the value. + # * <tt>:error_on_ignore</tt> - Overrides the application config to specify if an error should be raised when + # an order is present in the relation. # - # This is especially useful if you want to work with the - # ActiveRecord::Relation object instead of the array of records, or if - # you want multiple workers dealing with the same processing queue. You can - # make worker 1 handle all the records between id 0 and 10,000 and worker 2 - # handle from 10,000 and beyond (by setting the +:start+ and +:finish+ - # option on each worker). + # Limits are honored, and if present there is no requirement for the batch + # size, it can be less than, equal, or greater than the limit. # - # # Let's process the next 2000 records - # Person.in_batches(of: 2000, start: 2000).update_all(awesome: true) + # The options +start+ and +finish+ are especially useful if you want + # multiple workers dealing with the same processing queue. You can make + # worker 1 handle all the records between id 1 and 9999 and worker 2 + # handle from 10000 and beyond by setting the +:start+ and +:finish+ + # option on each worker. + # + # # Let's process from record 10_000 on. + # Person.in_batches(start: 10_000).update_all(awesome: true) # # An example of calling where query method on the relation: # @@ -169,31 +189,37 @@ module ActiveRecord # consistent. Therefore the primary key must be orderable, e.g an integer # or a string. # - # NOTE: You can't set the limit either, that's used to control the batch - # sizes. - def in_batches(of: 1000, start: nil, finish: nil, load: false) + # NOTE: By its nature, batch processing is subject to race conditions if + # other processes are modifying the database. + def in_batches(of: 1000, start: nil, finish: nil, load: false, error_on_ignore: nil) relation = self unless block_given? return BatchEnumerator.new(of: of, start: start, finish: finish, relation: self) end - if logger && (arel.orders.present? || arel.taken.present?) - logger.warn("Scoped order and limit are ignored, it's forced to be batch order and batch size") + if arel.orders.present? + act_on_ignored_order(error_on_ignore) + end + + batch_limit = of + if limit_value + remaining = limit_value + batch_limit = remaining if remaining < batch_limit end - relation = relation.reorder(batch_order).limit(of) + relation = relation.reorder(batch_order).limit(batch_limit) relation = apply_limits(relation, start, finish) batch_relation = relation loop do if load - records = batch_relation.to_a + records = batch_relation.records ids = records.map(&:id) - yielded_relation = self.where(primary_key => ids) + yielded_relation = where(primary_key => ids) yielded_relation.load_records(records) else ids = batch_relation.pluck(primary_key) - yielded_relation = self.where(primary_key => ids) + yielded_relation = where(primary_key => ids) end break if ids.empty? @@ -203,21 +229,44 @@ module ActiveRecord yield yielded_relation - break if ids.length < of + break if ids.length < batch_limit + + if limit_value + remaining -= ids.length + + if remaining == 0 + # Saves a useless iteration when the limit is a multiple of the + # batch size. + break + elsif remaining < batch_limit + relation = relation.limit(remaining) + end + end + batch_relation = relation.where(arel_attribute(primary_key).gt(primary_key_offset)) end end private - def apply_limits(relation, start, finish) - relation = relation.where(arel_attribute(primary_key).gteq(start)) if start - relation = relation.where(arel_attribute(primary_key).lteq(finish)) if finish - relation - end + def apply_limits(relation, start, finish) + relation = relation.where(arel_attribute(primary_key).gteq(start)) if start + relation = relation.where(arel_attribute(primary_key).lteq(finish)) if finish + relation + end - def batch_order - "#{quoted_table_name}.#{quoted_primary_key} ASC" - end + def batch_order + "#{quoted_table_name}.#{quoted_primary_key} ASC" + end + + def act_on_ignored_order(error_on_ignore) + raise_error = (error_on_ignore.nil? ? self.klass.error_on_ignored_order : error_on_ignore) + + if raise_error + raise ArgumentError.new(ORDER_IGNORE_MESSAGE) + elsif logger + logger.warn(ORDER_IGNORE_MESSAGE) + end + end end end |