aboutsummaryrefslogtreecommitdiffstats
path: root/activesupport/test/multibyte_grapheme_break_conformance.rb
blob: 7d185e2cae29e0293ce9ea3a643487217439d761 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# encoding: utf-8

require 'abstract_unit'

require 'fileutils'
require 'open-uri'
require 'tmpdir'

class Downloader
  def self.download(from, to)
    unless File.exist?(to)
      $stderr.puts "Downloading #{from} to #{to}"
      unless File.exist?(File.dirname(to))
        system "mkdir -p #{File.dirname(to)}"
      end
      open(from) do |source|
        File.open(to, 'w') do |target|
          source.each_line do |l|
            target.write l
          end
        end
       end
     end
  end
end

class MultibyteGraphemeBreakConformanceTest < ActiveSupport::TestCase
  TEST_DATA_URL = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::Unicode::UNICODE_VERSION}/ucd/auxiliary"
  TEST_DATA_FILE = '/GraphemeBreakTest.txt'
  CACHE_DIR = File.join(Dir.tmpdir, 'cache')

  def setup
    FileUtils.mkdir_p(CACHE_DIR)
    Downloader.download(TEST_DATA_URL + TEST_DATA_FILE, CACHE_DIR + TEST_DATA_FILE)
  end

  def test_breaks
    each_line_of_break_tests do |*cols|
      *clusters, comment = *cols
      packed = ActiveSupport::Multibyte::Unicode.pack_graphemes(clusters)
      assert_equal clusters, ActiveSupport::Multibyte::Unicode.unpack_graphemes(packed), comment
    end
  end

  protected
    def each_line_of_break_tests(&block)
      lines = 0
      max_test_lines = 0 # Don't limit below 21, because that's the header of the testfile
      File.open(File.join(CACHE_DIR, TEST_DATA_FILE), 'r') do | f |
        until f.eof? || (max_test_lines > 21 and lines > max_test_lines)
          lines += 1
          line = f.gets.chomp!
          next if (line.empty? || line =~ /^\#/)

          cols, comment = line.split("#")
          # Cluster breaks are represented by ÷
          clusters = cols.split("÷").map{|e| e.strip}.reject{|e| e.empty? }
          clusters = clusters.map do |cluster|
            # Codepoints within each cluster are separated by ×
            codepoints = cluster.split("×").map{|e| e.strip}.reject{|e| e.empty? }
            # codepoints are in hex in the test suite, pack wants them as integers
            codepoints.map{|codepoint| codepoint.to_i(16)}
          end

          # The tests contain a solitary U+D800 <Non Private Use High
          # Surrogate, First> character, which Ruby does not allow to stand
          # alone in a UTF-8 string. So we'll just skip it.
          next if clusters.flatten.include?(0xd800)

          clusters << comment.strip

          yield(*clusters)
        end
      end
    end
end