aboutsummaryrefslogtreecommitdiffstats
path: root/activesupport/test/multibyte_grapheme_break_conformance_test.rb
blob: 855626e7794534d850f67844c7beefbc62870894 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# encoding: utf-8
# frozen_string_literal: true

require "abstract_unit"
require "multibyte_test_helpers"

require "fileutils"
require "open-uri"
require "tmpdir"

# Conformance test that round-trips every sample from the GraphemeBreakTest.txt
# data file through ActiveSupport::Multibyte::Unicode.pack_graphemes and
# ActiveSupport::Multibyte::Unicode.unpack_graphemes.
class MultibyteGraphemeBreakConformanceTest < ActiveSupport::TestCase
  include MultibyteTestHelpers

  UNIDATA_FILE = "/auxiliary/GraphemeBreakTest.txt"

  # Result of fetching the data file while the class body is evaluated. It is
  # nil when the download raised a StandardError, in which case +setup+ skips
  # every test instead of failing.
  # NOTE(review): Downloader, UNIDATA_URL and CACHE_DIR are not defined in this
  # file; presumably they come from MultibyteTestHelpers -- confirm.
  RUN_P = begin
            Downloader.download(UNIDATA_URL + UNIDATA_FILE, CACHE_DIR + UNIDATA_FILE)
          rescue
          end

  # Skips the test when the data file could not be downloaded.
  def setup
    skip "Unable to download test data" unless RUN_P
  end

  # For each sample line, packs the expected clusters into a string and asserts
  # that unpacking that string yields the same clusters again. The line's
  # trailing comment is used as the assertion message.
  def test_breaks
    each_line_of_break_tests do |*cols|
      *clusters, comment = *cols
      packed = ActiveSupport::Multibyte::Unicode.pack_graphemes(clusters)
      assert_equal clusters, ActiveSupport::Multibyte::Unicode.unpack_graphemes(packed), comment
    end
  end

  private
    # Parses the cached data file and yields once per sample line.
    #
    # The yielded arguments are the line's grapheme clusters, each an Array of
    # Integer codepoints, followed by the line's comment as a stripped String
    # (empty when the line carries no comment).
    #
    # Blank lines, lines starting with "#", and samples containing U+D800 are
    # not yielded.
    def each_line_of_break_tests(&block)
      lines = 0
      max_test_lines = 0 # Don't limit below 21, because that's the header of the testfile
      File.open(File.join(CACHE_DIR, UNIDATA_FILE), "r") do |f|
        until f.eof? || (max_test_lines > 21 && lines > max_test_lines)
          lines += 1
          # Use the non-destructive chomp: chomp! returns nil when there is
          # nothing to remove (e.g. a final line without a trailing newline).
          line = f.gets.chomp
          next if line.empty? || line.start_with?("#")

          # Only the first "#" separates data from comment; the limit keeps any
          # later "#" inside the comment text.
          cols, comment = line.split("#", 2)
          # Cluster breaks are represented by ÷
          clusters = cols.split("÷").map(&:strip).reject(&:empty?)
          clusters = clusters.map do |cluster|
            # Codepoints within each cluster are separated by ×
            codepoints = cluster.split("×").map(&:strip).reject(&:empty?)
            # codepoints are in hex in the test suite, pack wants them as integers
            codepoints.map { |codepoint| codepoint.to_i(16) }
          end

          # The tests contain a solitary U+D800 <Non Private Use High
          # Surrogate, First> character, which Ruby does not allow to stand
          # alone in a UTF-8 string. So we'll just skip it.
          next if clusters.any? { |cluster| cluster.include?(0xd800) }

          # comment is nil when the line has no "#"; to_s turns that into "".
          clusters << comment.to_s.strip

          yield(*clusters)
        end
      end
    end
end