aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Adam Roben <adam@roben.org> 2013-11-13 17:16:55 -0500
committer Adam Roben <adam@roben.org> 2013-11-13 17:19:30 -0500
commit 3ca6d1fb4a3acc7a6dfff9ee39ee7f75fa71d0f4 (patch)
tree 3bf3d6cba50862bc30907f59f8560534d8cacb2b
parent e63aad97bd871f73016227ea28a775d0d37c9c0c (diff)
download rails-3ca6d1fb4a3acc7a6dfff9ee39ee7f75fa71d0f4.tar.gz
rails-3ca6d1fb4a3acc7a6dfff9ee39ee7f75fa71d0f4.tar.bz2
rails-3ca6d1fb4a3acc7a6dfff9ee39ee7f75fa71d0f4.zip
Support extended grapheme clusters and UAX 29
http://www.unicode.org/reports/tr29/tr29-21.html is the version of UAX 29 that corresponds to Unicode 6.2.0. Unicode.unpack_graphemes now implements all the rules listed there, including the ones for extended grapheme clusters. I also added a new optional test, test/multibyte_grapheme_break_conformance.rb, which runs the Unicode grapheme break test suite; it is heavily based on test/multibyte_normalization_conformance.rb.
-rw-r--r-- activesupport/lib/active_support/multibyte/unicode.rb 15
-rw-r--r-- activesupport/test/multibyte_grapheme_break_conformance.rb 76
2 files changed, 91 insertions, 0 deletions
diff --git a/activesupport/lib/active_support/multibyte/unicode.rb b/activesupport/lib/active_support/multibyte/unicode.rb
index aea7709b55..d85ea3b5e6 100644
--- a/activesupport/lib/active_support/multibyte/unicode.rb
+++ b/activesupport/lib/active_support/multibyte/unicode.rb
@@ -94,6 +94,12 @@ module ActiveSupport
# GB3. CR X LF
if previous == database.boundary[:cr] and current == database.boundary[:lf]
false
+ # GB4. (Control|CR|LF) ÷
+ elsif previous and in_char_class?(previous, [:control,:cr,:lf])
+ true
+ # GB5. ÷ (Control|CR|LF)
+ elsif in_char_class?(current, [:control,:cr,:lf])
+ true
# GB6. L X (L|V|LV|LVT)
elsif database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt])
false
@@ -103,9 +109,18 @@ module ActiveSupport
# GB8. (LVT|T) X (T)
elsif in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current
false
+ # GB8a. Regional_Indicator X Regional_Indicator
+ elsif database.boundary[:regional_indicator] === previous and database.boundary[:regional_indicator] === current
+ false
# GB9. X Extend
elsif database.boundary[:extend] === current
false
+ # GB9a. X SpacingMark
+ elsif database.boundary[:spacingmark] === current
+ false
+ # GB9b. Prepend X
+ elsif database.boundary[:prepend] === previous
+ false
# GB10. Any ÷ Any
else
true
diff --git a/activesupport/test/multibyte_grapheme_break_conformance.rb b/activesupport/test/multibyte_grapheme_break_conformance.rb
new file mode 100644
index 0000000000..7d185e2cae
--- /dev/null
+++ b/activesupport/test/multibyte_grapheme_break_conformance.rb
@@ -0,0 +1,76 @@
+# encoding: utf-8
+
+require 'abstract_unit'
+
+require 'fileutils'
+require 'open-uri'
+require 'tmpdir'
+
+# Fetches a remote file once and keeps a copy on disk so later runs can reuse it.
+class Downloader
+  # Downloads the URL +from+ into the local path +to+, creating the parent
+  # directory when needed. Does nothing if +to+ already exists.
+  #
+  # from - String URL of the resource to fetch.
+  # to   - String path of the file to write.
+  def self.download(from, to)
+    return if File.exist?(to)
+
+    $stderr.puts "Downloading #{from} to #{to}"
+    # FileUtils.mkdir_p rather than shelling out to `mkdir -p`: no shell
+    # interpolation of the path, and it is a no-op when the directory exists.
+    FileUtils.mkdir_p(File.dirname(to))
+    # Open through the parsed URI (open-uri's OpenURI::OpenRead#open) instead of
+    # Kernel#open, which newer Rubies no longer patch to accept URLs and which
+    # treats a string starting with "|" as a command to run.
+    URI.parse(from).open do |source|
+      File.open(to, 'w') do |target|
+        source.each_line { |line| target.write(line) }
+      end
+    end
+  end
+end
+
+# Runs the GraphemeBreakTest.txt conformance data published by Unicode against
+# ActiveSupport::Multibyte::Unicode.unpack_graphemes. The data file is
+# downloaded into CACHE_DIR by #setup if it is not already there.
+class MultibyteGraphemeBreakConformanceTest < ActiveSupport::TestCase
+  TEST_DATA_URL = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::Unicode::UNICODE_VERSION}/ucd/auxiliary"
+  TEST_DATA_FILE = '/GraphemeBreakTest.txt'
+  CACHE_DIR = File.join(Dir.tmpdir, 'cache')
+
+  # Makes sure the cache directory and the test data file exist.
+  def setup
+    FileUtils.mkdir_p(CACHE_DIR)
+    Downloader.download(TEST_DATA_URL + TEST_DATA_FILE, CACHE_DIR + TEST_DATA_FILE)
+  end
+
+  # For every test line, packs the expected clusters into a string and checks
+  # that unpack_graphemes splits it back into the same clusters. The line's
+  # trailing comment is used as the assertion message.
+  def test_breaks
+    each_line_of_break_tests do |*cols|
+      *clusters, comment = *cols
+      packed = ActiveSupport::Multibyte::Unicode.pack_graphemes(clusters)
+      assert_equal clusters, ActiveSupport::Multibyte::Unicode.unpack_graphemes(packed), comment
+    end
+  end
+
+  protected
+    # Parses the cached data file and yields once per test line. The yielded
+    # arguments are the expected clusters (each an Array of Integer
+    # codepoints) followed by the line's comment String as the last argument.
+    # Blank lines and lines starting with "#" are skipped.
+    def each_line_of_break_tests
+      lines = 0
+      max_test_lines = 0 # Don't limit below 21, because that's the header of the testfile
+      # Read as UTF-8 explicitly: the lines are split on the UTF-8 literals
+      # "÷" and "×" below, which fails if the file is read in another encoding.
+      File.open(File.join(CACHE_DIR, TEST_DATA_FILE), 'r:UTF-8') do |f|
+        f.each_line do |raw|
+          break if max_test_lines > 21 && lines > max_test_lines
+          lines += 1
+          # chomp, not chomp!: the bang form returns nil when there is no
+          # trailing newline to remove (e.g. an unterminated last line).
+          line = raw.chomp
+          next if line.empty? || line.start_with?('#')
+
+          # Split only at the first "#" so the whole comment is kept.
+          cols, comment = line.split('#', 2)
+          # Cluster breaks are represented by ÷
+          clusters = cols.split('÷').map(&:strip).reject(&:empty?).map do |cluster|
+            # Codepoints within each cluster are separated by ×
+            codepoints = cluster.split('×').map(&:strip).reject(&:empty?)
+            # codepoints are in hex in the test suite, pack wants them as integers
+            codepoints.map { |codepoint| codepoint.to_i(16) }
+          end
+
+          # The tests contain a solitary U+D800 <Non Private Use High
+          # Surrogate, First> character, which Ruby does not allow to stand
+          # alone in a UTF-8 string. So we'll just skip it.
+          next if clusters.flatten.include?(0xd800)
+
+          # to_s guards against a data line that carries no comment.
+          clusters << comment.to_s.strip
+
+          yield(*clusters)
+        end
+      end
+    end
+end