1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
|
# frozen_string_literal: true
module ActiveSupport
module Multibyte
module Unicode
extend self
# A list of all available normalization forms.
# See https://www.unicode.org/reports/tr15/tr15-29.html for more
# information about normalization.
NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
NORMALIZATION_FORM_ALIASES = { # :nodoc:
c: :nfc,
d: :nfd,
kc: :nfkc,
kd: :nfkd
}
# The Unicode version that is supported by the implementation
UNICODE_VERSION = RbConfig::CONFIG["UNICODE_VERSION"]
# The default normalization used for operations that require
# normalization. It can be set to any of the normalizations
# in NORMALIZATION_FORMS.
#
# ActiveSupport::Multibyte::Unicode.default_normalization_form = :c
attr_accessor :default_normalization_form
@default_normalization_form = :kc
# Unpack the string at grapheme boundaries. Returns a list of character
# lists.
#
# Unicode.unpack_graphemes('क्षि') # => [[2325, 2381], [2359], [2367]]
# Unicode.unpack_graphemes('Café') # => [[67], [97], [102], [233]]
def unpack_graphemes(string)
string.scan(/\X/).map(&:codepoints)
end
# Reverse operation of unpack_graphemes.
#
# Unicode.pack_graphemes(Unicode.unpack_graphemes('क्षि')) # => 'क्षि'
def pack_graphemes(unpacked)
unpacked.flatten.pack("U*")
end
# Decompose composed characters to the decomposed form.
def decompose(type, codepoints)
if type == :compatibility
codepoints.pack("U*").unicode_normalize(:nfkd).codepoints
else
codepoints.pack("U*").unicode_normalize(:nfd).codepoints
end
end
# Compose decomposed characters to the composed form.
def compose(codepoints)
codepoints.pack("U*").unicode_normalize(:nfc).codepoints
end
# Rubinius' String#scrub, however, doesn't support ASCII-incompatible chars.
if !defined?(Rubinius)
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
# resulting in a valid UTF-8 string.
#
# Passing +true+ will forcibly tidy all bytes, assuming that the string's
# encoding is entirely CP1252 or ISO-8859-1.
def tidy_bytes(string, force = false)
return string if string.empty?
return recode_windows1252_chars(string) if force
string.scrub { |bad| recode_windows1252_chars(bad) }
end
else
def tidy_bytes(string, force = false)
return string if string.empty?
return recode_windows1252_chars(string) if force
# We can't transcode to the same format, so we choose a nearly-identical encoding.
# We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
# CP1252 when we get errors. The final string will be 'converted' back to UTF-8
# before returning.
reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_16LE)
source = string.dup
out = "".force_encoding(Encoding::UTF_16LE)
loop do
reader.primitive_convert(source, out)
_, _, _, error_bytes, _ = reader.primitive_errinfo
break if error_bytes.nil?
out << error_bytes.encode(Encoding::UTF_16LE, Encoding::Windows_1252, invalid: :replace, undef: :replace)
end
reader.finish
out.encode!(Encoding::UTF_8)
end
end
# Returns the KC normalization of the string by default. NFKC is
# considered the best normalization form for passing strings to databases
# and validations.
#
# * <tt>string</tt> - The string to perform normalization on.
# * <tt>form</tt> - The form you want to normalize in. Should be one of
# the following: <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>.
# Default is ActiveSupport::Multibyte::Unicode.default_normalization_form.
def normalize(string, form = nil)
form ||= @default_normalization_form
# See https://www.unicode.org/reports/tr15, Table 1
if alias_form = NORMALIZATION_FORM_ALIASES[form]
ActiveSupport::Deprecation.warn(<<-MSG.squish)
ActiveSupport::Multibyte::Unicode#normalize is deprecated and will be
removed from Rails 6.1. Use String#unicode_normalize(:#{alias_form}) instead.
MSG
string.unicode_normalize(alias_form)
else
ActiveSupport::Deprecation.warn(<<-MSG.squish)
ActiveSupport::Multibyte::Unicode#normalize is deprecated and will be
removed from Rails 6.1. Use String#unicode_normalize instead.
MSG
raise ArgumentError, "#{form} is not a valid normalization variant", caller
end
end
%w(downcase upcase swapcase).each do |method|
define_method(method) do |string|
ActiveSupport::Deprecation.warn(<<-MSG.squish)
ActiveSupport::Multibyte::Unicode##{method} is deprecated and
will be removed from Rails 6.1. Use String methods directly.
MSG
string.send(method)
end
end
private
def recode_windows1252_chars(string)
string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
end
end
end
end
|