diff options
Diffstat (limited to 'activesupport')
-rw-r--r-- | activesupport/CHANGELOG.md | 5 | ||||
-rw-r--r-- | activesupport/MIT-LICENSE | 4 | ||||
-rw-r--r-- | activesupport/lib/active_support.rb | 2 | ||||
-rw-r--r-- | activesupport/lib/active_support/multibyte/unicode.rb | 51 | ||||
-rw-r--r-- | activesupport/test/multibyte_chars_test.rb | 32 | ||||
-rw-r--r-- | activesupport/test/multibyte_grapheme_break_conformance.rb | 76 | ||||
-rw-r--r-- | activesupport/test/multibyte_normalization_conformance.rb | 129 |
7 files changed, 280 insertions, 19 deletions
diff --git a/activesupport/CHANGELOG.md b/activesupport/CHANGELOG.md index cee1e57c93..4963dd25d1 100644 --- a/activesupport/CHANGELOG.md +++ b/activesupport/CHANGELOG.md @@ -1,3 +1,8 @@ +* Support extended grapheme clusters and UAX 29. + + *Adam Roben* + + ## Rails 5.0.0.beta1 (December 18, 2015) ## * Add petabyte and exabyte numeric conversion. diff --git a/activesupport/MIT-LICENSE b/activesupport/MIT-LICENSE index 7bffebb076..40235833ba 100644 --- a/activesupport/MIT-LICENSE +++ b/activesupport/MIT-LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2005-2015 David Heinemeier Hansson +Copyright (c) 2005-2016 David Heinemeier Hansson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -17,4 +17,4 @@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/activesupport/lib/active_support.rb b/activesupport/lib/active_support.rb index 2019afeb00..94fe893149 100644 --- a/activesupport/lib/active_support.rb +++ b/activesupport/lib/active_support.rb @@ -1,5 +1,5 @@ #-- -# Copyright (c) 2005-2015 David Heinemeier Hansson +# Copyright (c) 2005-2016 David Heinemeier Hansson # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb index 586002b03b..72b20fff06 100644 --- a/activesupport/lib/active_support/multibyte/unicode.rb +++ b/activesupport/lib/active_support/multibyte/unicode.rb @@ -87,19 +87,44 @@ module ActiveSupport pos += 1 previous = codepoints[pos-1] current = codepoints[pos] - if ( - # CR X LF - ( previous == database.boundary[:cr] and current == database.boundary[:lf] ) or - # L X (L|V|LV|LVT) - ( database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or - # (LV|V) X (V|T) - ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or - # (LVT|T) X (T) - ( in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current ) or - # X Extend - (database.boundary[:extend] === current) - ) - else + + should_break = + # GB3. CR X LF + if previous == database.boundary[:cr] and current == database.boundary[:lf] + false + # GB4. (Control|CR|LF) ÷ + elsif previous and in_char_class?(previous, [:control,:cr,:lf]) + true + # GB5. ÷ (Control|CR|LF) + elsif in_char_class?(current, [:control,:cr,:lf]) + true + # GB6. L X (L|V|LV|LVT) + elsif database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) + false + # GB7. (LV|V) X (V|T) + elsif in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) + false + # GB8. (LVT|T) X (T) + elsif in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current + false + # GB8a. Regional_Indicator X Regional_Indicator + elsif database.boundary[:regional_indicator] === previous and database.boundary[:regional_indicator] === current + false + # GB9. X Extend + elsif database.boundary[:extend] === current + false + # GB9a. X SpacingMark + elsif database.boundary[:spacingmark] === current + false + # GB9b. Prepend X + elsif database.boundary[:prepend] === previous + false + # GB10. Any ÷ Any + else + true + end + + if should_break unpacked << codepoints[marker..pos-1] marker = pos end diff --git a/activesupport/test/multibyte_chars_test.rb b/activesupport/test/multibyte_chars_test.rb index 8d4d9d736c..c1e0b19248 100644 --- a/activesupport/test/multibyte_chars_test.rb +++ b/activesupport/test/multibyte_chars_test.rb @@ -612,28 +612,54 @@ class MultibyteCharsExtrasTest < ActiveSupport::TestCase ['abc', 3], ['こにちわ', 4], [[0x0924, 0x094D, 0x0930].pack('U*'), 2], + # GB3 [%w(cr lf), 1], + # GB4 + [%w(cr n), 2], + [%w(lf n), 2], + [%w(control n), 2], + [%w(cr extend), 2], + [%w(lf extend), 2], + [%w(control extend), 2], + # GB 5 + [%w(n cr), 2], + [%w(n lf), 2], + [%w(n control), 2], + [%w(extend cr), 2], + [%w(extend lf), 2], + [%w(extend control), 2], + # GB 6 [%w(l l), 1], [%w(l v), 1], [%w(l lv), 1], [%w(l lvt), 1], + # GB7 [%w(lv v), 1], [%w(lv t), 1], [%w(v v), 1], [%w(v t), 1], + # GB8 [%w(lvt t), 1], [%w(t t), 1], + # GB8a + [%w(r r), 1], + # GB9 [%w(n extend), 1], + # GB9a + [%w(n spacingmark), 1], + # GB10 [%w(n n), 2], + # Other [%w(n cr lf n), 3], - [%w(n l v t), 2] + [%w(n l v t), 2], + [%w(cr extend n), 3], ].each do |input, expected_length| if input.kind_of?(Array) str = string_from_classes(input) else str = input end - assert_equal expected_length, chars(str).grapheme_length + assert_equal expected_length, chars(str).grapheme_length, input.inspect end end @@ -698,7 +724,7 @@ class MultibyteCharsExtrasTest < ActiveSupport::TestCase # Characters from the character classes as described in UAX #29 character_from_class = { :l => 0x1100, :v => 0x1160, :t => 0x11A8, :lv => 0xAC00, :lvt => 0xAC01, :cr => 0x000D, :lf => 0x000A, - :extend => 0x094D, :n => 0x64 + :extend => 0x094D, :n => 0x64, :spacingmark => 0x0903, :r => 0x1F1E6, :control => 0x0001 } classes.collect do |k| character_from_class[k.intern] diff --git a/activesupport/test/multibyte_grapheme_break_conformance.rb b/activesupport/test/multibyte_grapheme_break_conformance.rb new file mode 100644 index 0000000000..7d185e2cae --- /dev/null +++ b/activesupport/test/multibyte_grapheme_break_conformance.rb @@ -0,0 +1,76 @@ +# encoding: utf-8 + +require 'abstract_unit' + +require 'fileutils' +require 'open-uri' +require 'tmpdir' + +class Downloader + def self.download(from, to) + unless File.exist?(to) + $stderr.puts "Downloading #{from} to #{to}" + unless File.exist?(File.dirname(to)) + system "mkdir -p #{File.dirname(to)}" + end + open(from) do |source| + File.open(to, 'w') do |target| + source.each_line do |l| + target.write l + end + end + end + end + end +end + +class MultibyteGraphemeBreakConformanceTest < ActiveSupport::TestCase + TEST_DATA_URL = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::Unicode::UNICODE_VERSION}/ucd/auxiliary" + TEST_DATA_FILE = '/GraphemeBreakTest.txt' + CACHE_DIR = File.join(Dir.tmpdir, 'cache') + + def setup + FileUtils.mkdir_p(CACHE_DIR) + Downloader.download(TEST_DATA_URL + TEST_DATA_FILE, CACHE_DIR + TEST_DATA_FILE) + end + + def test_breaks + each_line_of_break_tests do |*cols| + *clusters, comment = *cols + packed = ActiveSupport::Multibyte::Unicode.pack_graphemes(clusters) + assert_equal clusters, ActiveSupport::Multibyte::Unicode.unpack_graphemes(packed), comment + end + end + + protected + def each_line_of_break_tests(&block) + lines = 0 + max_test_lines = 0 # Don't limit below 21, because that's the header of the testfile + File.open(File.join(CACHE_DIR, TEST_DATA_FILE), 'r') do | f | + until f.eof? || (max_test_lines > 21 and lines > max_test_lines) + lines += 1 + line = f.gets.chomp! + next if (line.empty? || line =~ /^\#/) + + cols, comment = line.split("#") + # Cluster breaks are represented by ÷ + clusters = cols.split("÷").map{|e| e.strip}.reject{|e| e.empty? } + clusters = clusters.map do |cluster| + # Codepoints within each cluster are separated by × + codepoints = cluster.split("×").map{|e| e.strip}.reject{|e| e.empty? } + # codepoints are in hex in the test suite, pack wants them as integers + codepoints.map{|codepoint| codepoint.to_i(16)} + end + + # The tests contain a solitary U+D800 <Non Private Use High + # Surrogate, First> character, which Ruby does not allow to stand + # alone in a UTF-8 string. So we'll just skip it. + next if clusters.flatten.include?(0xd800) + + clusters << comment.strip + + yield(*clusters) + end + end + end +end diff --git a/activesupport/test/multibyte_normalization_conformance.rb b/activesupport/test/multibyte_normalization_conformance.rb new file mode 100644 index 0000000000..839aec7fa8 --- /dev/null +++ b/activesupport/test/multibyte_normalization_conformance.rb @@ -0,0 +1,129 @@ +# encoding: utf-8 + +require 'abstract_unit' +require 'multibyte_test_helpers' + +require 'fileutils' +require 'open-uri' +require 'tmpdir' + +class Downloader + def self.download(from, to) + unless File.exist?(to) + $stderr.puts "Downloading #{from} to #{to}" + unless File.exist?(File.dirname(to)) + system "mkdir -p #{File.dirname(to)}" + end + open(from) do |source| + File.open(to, 'w') do |target| + source.each_line do |l| + target.write l + end + end + end + end + end +end + +class MultibyteNormalizationConformanceTest < ActiveSupport::TestCase + include MultibyteTestHelpers + + UNIDATA_URL = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::Unicode::UNICODE_VERSION}/ucd" + UNIDATA_FILE = '/NormalizationTest.txt' + CACHE_DIR = File.join(Dir.tmpdir, 'cache') + + def setup + FileUtils.mkdir_p(CACHE_DIR) + Downloader.download(UNIDATA_URL + UNIDATA_FILE, CACHE_DIR + UNIDATA_FILE) + @proxy = ActiveSupport::Multibyte::Chars + end + + def test_normalizations_C + each_line_of_norm_tests do |*cols| + col1, col2, col3, col4, col5, comment = *cols + + # CONFORMANCE: + # 1. The following invariants must be true for all conformant implementations + # + # NFC + # c2 == NFC(c1) == NFC(c2) == NFC(c3) + assert_equal_codepoints col2, @proxy.new(col1).normalize(:c), "Form C - Col 2 has to be NFC(1) - #{comment}" + assert_equal_codepoints col2, @proxy.new(col2).normalize(:c), "Form C - Col 2 has to be NFC(2) - #{comment}" + assert_equal_codepoints col2, @proxy.new(col3).normalize(:c), "Form C - Col 2 has to be NFC(3) - #{comment}" + # + # c4 == NFC(c4) == NFC(c5) + assert_equal_codepoints col4, @proxy.new(col4).normalize(:c), "Form C - Col 4 has to be C(4) - #{comment}" + assert_equal_codepoints col4, @proxy.new(col5).normalize(:c), "Form C - Col 4 has to be C(5) - #{comment}" + end + end + + def test_normalizations_D + each_line_of_norm_tests do |*cols| + col1, col2, col3, col4, col5, comment = *cols + # + # NFD + # c3 == NFD(c1) == NFD(c2) == NFD(c3) + assert_equal_codepoints col3, @proxy.new(col1).normalize(:d), "Form D - Col 3 has to be NFD(1) - #{comment}" + assert_equal_codepoints col3, @proxy.new(col2).normalize(:d), "Form D - Col 3 has to be NFD(2) - #{comment}" + assert_equal_codepoints col3, @proxy.new(col3).normalize(:d), "Form D - Col 3 has to be NFD(3) - #{comment}" + # c5 == NFD(c4) == NFD(c5) + assert_equal_codepoints col5, @proxy.new(col4).normalize(:d), "Form D - Col 5 has to be NFD(4) - #{comment}" + assert_equal_codepoints col5, @proxy.new(col5).normalize(:d), "Form D - Col 5 has to be NFD(5) - #{comment}" + end + end + + def test_normalizations_KC + each_line_of_norm_tests do | *cols | + col1, col2, col3, col4, col5, comment = *cols + # + # NFKC + # c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5) + assert_equal_codepoints col4, @proxy.new(col1).normalize(:kc), "Form D - Col 4 has to be NFKC(1) - #{comment}" + assert_equal_codepoints col4, @proxy.new(col2).normalize(:kc), "Form D - Col 4 has to be NFKC(2) - #{comment}" + assert_equal_codepoints col4, @proxy.new(col3).normalize(:kc), "Form D - Col 4 has to be NFKC(3) - #{comment}" + assert_equal_codepoints col4, @proxy.new(col4).normalize(:kc), "Form D - Col 4 has to be NFKC(4) - #{comment}" + assert_equal_codepoints col4, @proxy.new(col5).normalize(:kc), "Form D - Col 4 has to be NFKC(5) - #{comment}" + end + end + + def test_normalizations_KD + each_line_of_norm_tests do | *cols | + col1, col2, col3, col4, col5, comment = *cols + # + # NFKD + # c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5) + assert_equal_codepoints col5, @proxy.new(col1).normalize(:kd), "Form KD - Col 5 has to be NFKD(1) - #{comment}" + assert_equal_codepoints col5, @proxy.new(col2).normalize(:kd), "Form KD - Col 5 has to be NFKD(2) - #{comment}" + assert_equal_codepoints col5, @proxy.new(col3).normalize(:kd), "Form KD - Col 5 has to be NFKD(3) - #{comment}" + assert_equal_codepoints col5, @proxy.new(col4).normalize(:kd), "Form KD - Col 5 has to be NFKD(4) - #{comment}" + assert_equal_codepoints col5, @proxy.new(col5).normalize(:kd), "Form KD - Col 5 has to be NFKD(5) - #{comment}" + end + end + + protected + def each_line_of_norm_tests(&block) + lines = 0 + max_test_lines = 0 # Don't limit below 38, because that's the header of the testfile + File.open(File.join(CACHE_DIR, UNIDATA_FILE), 'r') do | f | + until f.eof? || (max_test_lines > 38 and lines > max_test_lines) + lines += 1 + line = f.gets.chomp! + next if (line.empty? || line =~ /^\#/) + + cols, comment = line.split("#") + cols = cols.split(";").map{|e| e.strip}.reject{|e| e.empty? } + next unless cols.length == 5 + + # codepoints are in hex in the test suite, pack wants them as integers + cols.map!{|c| c.split.map{|codepoint| codepoint.to_i(16)}.pack("U*") } + cols << comment + + yield(*cols) + end + end + end + + def inspect_codepoints(str) + str.to_s.unpack("U*").map{|cp| cp.to_s(16) }.join(' ') + end +end |