aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--activesupport/CHANGELOG.md16
-rwxr-xr-xactivesupport/bin/generate_tables4
-rw-r--r--activesupport/lib/active_support/multibyte/unicode.rb26
-rw-r--r--activesupport/lib/active_support/values/unicode_tables.datbin1068675 -> 1116857 bytes
-rw-r--r--activesupport/test/multibyte_test_helpers.rb2
5 files changed, 38 insertions, 10 deletions
diff --git a/activesupport/CHANGELOG.md b/activesupport/CHANGELOG.md
index 5207194fba..5af97e3d37 100644
--- a/activesupport/CHANGELOG.md
+++ b/activesupport/CHANGELOG.md
@@ -1,3 +1,19 @@
+* Updated Unicode version to 9.0.0
+
+ Now we can handle new emojis such like "๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ" ("\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}").
+
+ version 8.0.0
+
+ "๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ".mb_chars.grapheme_length # => 4
+ "๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ".mb_chars.reverse # => "๐Ÿ‘ฆ๐Ÿ‘งโ€๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€"
+
+ version 9.0.0
+
+ "๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ".mb_chars.grapheme_length # => 1
+ "๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ".mb_chars.reverse # => "๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ"
+
+ *Fumiaki MATSUSHIMA*
+
* Changed `ActiveSupport::Inflector#transliterate` to raise `ArgumentError` when it receives
anything except a string.
diff --git a/activesupport/bin/generate_tables b/activesupport/bin/generate_tables
index 5d912f375c..aa36a01b5b 100755
--- a/activesupport/bin/generate_tables
+++ b/activesupport/bin/generate_tables
@@ -8,6 +8,7 @@ end
require "open-uri"
require "tmpdir"
+require "fileutils"
module ActiveSupport
module Multibyte
@@ -101,9 +102,10 @@ module ActiveSupport
def parse
SOURCES.each do |type, url|
- filename = File.join(Dir.tmpdir, "#{url.split('/').last}")
+ filename = File.join(Dir.tmpdir, UNICODE_VERSION, "#{url.split('/').last}")
unless File.exist?(filename)
$stderr.puts "Downloading #{url.split('/').last}"
+ FileUtils.mkdir_p(File.dirname(filename))
File.open(filename, "wb") do |target|
open(url) do |source|
source.each_line { |line| target.write line }
diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb
index 05cfb249c3..0912912aba 100644
--- a/activesupport/lib/active_support/multibyte/unicode.rb
+++ b/activesupport/lib/active_support/multibyte/unicode.rb
@@ -9,7 +9,7 @@ module ActiveSupport
NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
# The Unicode version that is supported by the implementation
- UNICODE_VERSION = "8.0.0"
+ UNICODE_VERSION = "9.0.0"
# The default normalization used for operations that require
# normalization. It can be set to any of the normalizations
@@ -57,9 +57,12 @@ module ActiveSupport
previous = codepoints[pos - 1]
current = codepoints[pos]
+ # See http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
should_break =
+ if pos == eoc
+ true
# GB3. CR X LF
- if previous == database.boundary[:cr] && current == database.boundary[:lf]
+ elsif previous == database.boundary[:cr] && current == database.boundary[:lf]
false
# GB4. (Control|CR|LF) รท
elsif previous && in_char_class?(previous, [:control, :cr, :lf])
@@ -76,11 +79,8 @@ module ActiveSupport
# GB8. (LVT|T) X (T)
elsif in_char_class?(previous, [:lvt, :t]) && database.boundary[:t] === current
false
- # GB8a. Regional_Indicator X Regional_Indicator
- elsif database.boundary[:regional_indicator] === previous && database.boundary[:regional_indicator] === current
- false
- # GB9. X Extend
- elsif database.boundary[:extend] === current
+ # GB9. X (Extend | ZWJ)
+ elsif in_char_class?(current, [:extend, :zwj])
false
# GB9a. X SpacingMark
elsif database.boundary[:spacingmark] === current
@@ -88,7 +88,17 @@ module ActiveSupport
# GB9b. Prepend X
elsif database.boundary[:prepend] === previous
false
- # GB10. Any รท Any
+ # GB10. (E_Base | EBG) Extend* X E_Modifier
+ elsif (marker...pos).any? { |i| in_char_class?(codepoints[i], [:e_base, :e_base_gaz]) && codepoints[i + 1...pos].all? { |c| database.boundary[:extend] === c } } && database.boundary[:e_modifier] === current
+ false
+ # GB11. ZWJ X (Glue_After_Zwj | EBG)
+ elsif database.boundary[:zwj] === previous && in_char_class?(current, [:glue_after_zwj, :e_base_gaz])
+ false
+ # GB12. ^ (RI RI)* RI X RI
+ # GB13. [^RI] (RI RI)* RI X RI
+ elsif codepoints[marker..pos].all? { |c| database.boundary[:regional_indicator] === c } && codepoints[marker..pos].count { |c| database.boundary[:regional_indicator] === c }.even?
+ false
+ # GB999. Any รท Any
else
true
end
diff --git a/activesupport/lib/active_support/values/unicode_tables.dat b/activesupport/lib/active_support/values/unicode_tables.dat
index dd2c178fb6..f7d9c48bbe 100644
--- a/activesupport/lib/active_support/values/unicode_tables.dat
+++ b/activesupport/lib/active_support/values/unicode_tables.dat
Binary files differ
diff --git a/activesupport/test/multibyte_test_helpers.rb b/activesupport/test/multibyte_test_helpers.rb
index 2201860d8a..a70516bb08 100644
--- a/activesupport/test/multibyte_test_helpers.rb
+++ b/activesupport/test/multibyte_test_helpers.rb
@@ -18,7 +18,7 @@ module MultibyteTestHelpers
end
UNIDATA_URL = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::Unicode::UNICODE_VERSION}/ucd"
- CACHE_DIR = "#{Dir.tmpdir}/cache/unicode_conformance"
+ CACHE_DIR = "#{Dir.tmpdir}/cache/unicode_conformance/#{ActiveSupport::Multibyte::Unicode::UNICODE_VERSION}"
FileUtils.mkdir_p(CACHE_DIR)
UNICODE_STRING = "ใ“ใซใกใ‚".freeze