aboutsummaryrefslogtreecommitdiffstats
path: root/activesupport/lib/active_support/multibyte
diff options
context:
space:
mode:
Diffstat (limited to 'activesupport/lib/active_support/multibyte')
-rw-r--r--activesupport/lib/active_support/multibyte/chars.rb15
-rw-r--r--activesupport/lib/active_support/multibyte/unicode.rb28
2 files changed, 27 insertions, 16 deletions
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb
index 262f25b874..8c58466556 100644
--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
@@ -16,7 +16,8 @@ module ActiveSupport #:nodoc:
# through the +mb_chars+ method. Methods which would normally return a
# String object now return a Chars object so methods can be chained.
#
- # 'The Perfect String '.mb_chars.downcase.strip.normalize # => "the perfect string"
+ # 'The Perfect String '.mb_chars.downcase.strip.normalize
+ # # => #<ActiveSupport::Multibyte::Chars:0x007fdc434ccc10 @wrapped_string="the perfect string">
#
# Chars objects are perfectly interchangeable with String objects as long as
# no explicit class checks are made. If certain methods do explicitly check
@@ -134,7 +135,7 @@ module ActiveSupport #:nodoc:
# Converts characters in the string to the opposite case.
#
- # 'El Cañón".mb_chars.swapcase.to_s # => "eL cAÑÓN"
+ # 'El Cañón'.mb_chars.swapcase.to_s # => "eL cAÑÓN"
def swapcase
chars Unicode.swapcase(@wrapped_string)
end
@@ -148,8 +149,8 @@ module ActiveSupport #:nodoc:
# Capitalizes the first letter of every word, when possible.
#
- # "ÉL QUE SE ENTERÓ".mb_chars.titleize # => "Él Que Se Enteró"
- # "日本語".mb_chars.titleize # => "日本語"
+ # "ÉL QUE SE ENTERÓ".mb_chars.titleize.to_s # => "Él Que Se Enteró"
+ # "日本語".mb_chars.titleize.to_s # => "日本語"
def titleize
chars(downcase.to_s.gsub(/\b('?\S)/u) { Unicode.upcase($1) })
end
@@ -210,9 +211,9 @@ module ActiveSupport #:nodoc:
end
end
- protected
+ private
- def translate_offset(byte_offset) #:nodoc:
+ def translate_offset(byte_offset)
return nil if byte_offset.nil?
return 0 if @wrapped_string == ""
@@ -224,7 +225,7 @@ module ActiveSupport #:nodoc:
end
end
- def chars(string) #:nodoc:
+ def chars(string)
self.class.new(string)
end
end
diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb
index 7842264b39..0912912aba 100644
--- a/activesupport/lib/active_support/multibyte/unicode.rb
+++ b/activesupport/lib/active_support/multibyte/unicode.rb
@@ -9,7 +9,7 @@ module ActiveSupport
NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
# The Unicode version that is supported by the implementation
- UNICODE_VERSION = "8.0.0"
+ UNICODE_VERSION = "9.0.0"
# The default normalization used for operations that require
# normalization. It can be set to any of the normalizations
@@ -57,9 +57,12 @@ module ActiveSupport
previous = codepoints[pos - 1]
current = codepoints[pos]
+ # See http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
should_break =
+ if pos == eoc
+ true
# GB3. CR X LF
- if previous == database.boundary[:cr] && current == database.boundary[:lf]
+ elsif previous == database.boundary[:cr] && current == database.boundary[:lf]
false
# GB4. (Control|CR|LF) ÷
elsif previous && in_char_class?(previous, [:control, :cr, :lf])
@@ -76,11 +79,8 @@ module ActiveSupport
# GB8. (LVT|T) X (T)
elsif in_char_class?(previous, [:lvt, :t]) && database.boundary[:t] === current
false
- # GB8a. Regional_Indicator X Regional_Indicator
- elsif database.boundary[:regional_indicator] === previous && database.boundary[:regional_indicator] === current
- false
- # GB9. X Extend
- elsif database.boundary[:extend] === current
+ # GB9. X (Extend | ZWJ)
+ elsif in_char_class?(current, [:extend, :zwj])
false
# GB9a. X SpacingMark
elsif database.boundary[:spacingmark] === current
@@ -88,7 +88,17 @@ module ActiveSupport
# GB9b. Prepend X
elsif database.boundary[:prepend] === previous
false
- # GB10. Any ÷ Any
+ # GB10. (E_Base | EBG) Extend* X E_Modifier
+ elsif (marker...pos).any? { |i| in_char_class?(codepoints[i], [:e_base, :e_base_gaz]) && codepoints[i + 1...pos].all? { |c| database.boundary[:extend] === c } } && database.boundary[:e_modifier] === current
+ false
+ # GB11. ZWJ X (Glue_After_Zwj | EBG)
+ elsif database.boundary[:zwj] === previous && in_char_class?(current, [:glue_after_zwj, :e_base_gaz])
+ false
+ # GB12. ^ (RI RI)* RI X RI
+ # GB13. [^RI] (RI RI)* RI X RI
+ elsif codepoints[marker..pos].all? { |c| database.boundary[:regional_indicator] === c } && codepoints[marker..pos].count { |c| database.boundary[:regional_indicator] === c }.even?
+ false
+ # GB999. Any ÷ Any
else
true
end
@@ -358,7 +368,7 @@ module ActiveSupport
private
- def apply_mapping(string, mapping) #:nodoc:
+ def apply_mapping(string, mapping)
database.codepoints
string.each_codepoint.map do |codepoint|
cp = database.codepoints[codepoint]