From 369daa530dd9db7bbba79b8c75012e0fa84c9f48 Mon Sep 17 00:00:00 2001 From: Cliff Pruitt Date: Thu, 25 Jul 2019 14:51:42 -0400 Subject: Handle US-ASCII strings with invalid characters in transliterate US-ASCII is a subset of UTF-8 so we can temporarily convert US-ASCII strings to UTF-8 to perform the transliteration. After we've converted characters to ASCII representations, we can set the encoding back to US-ASCII to return the same encoding we accepted. --- .../lib/active_support/inflector/transliterate.rb | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'activesupport/lib') diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb index 61651ba101..a6f57d73ac 100644 --- a/activesupport/lib/active_support/inflector/transliterate.rb +++ b/activesupport/lib/active_support/inflector/transliterate.rb @@ -61,13 +61,26 @@ module ActiveSupport # supported and will raise an ArgumentError. def transliterate(string, replacement = "?", locale: nil) raise ArgumentError, "Can only transliterate strings. 
Received #{string.class.name}" unless string.is_a?(String) - raise ArgumentError, "Can not transliterate strings with ASCII-8BIT encoding" if string.encoding == ::Encoding::ASCII_8BIT - I18n.transliterate( + allowed_encodings = [Encoding::UTF_8, Encoding::US_ASCII, Encoding::GB18030] + raise ArgumentError, "Can not transliterate strings with #{string.encoding} encoding" unless allowed_encodings.include?(string.encoding) + + input_encoding = string.encoding + + # US-ASCII is a subset of UTF-8 so we'll force encoding as UTF-8 if US-ASCII is given + # This way we can handle invalid bytes in the same way as we do for UTF-8 + string.force_encoding(Encoding::UTF_8) if string.encoding == Encoding::US_ASCII + + transliterated = I18n.transliterate( ActiveSupport::Multibyte::Unicode.tidy_bytes(string).unicode_normalize(:nfc), replacement: replacement, locale: locale ) + + # If we were given US-ASCII we give back US-ASCII + transliterated.force_encoding(Encoding::US_ASCII) if input_encoding == Encoding::US_ASCII + + transliterated end # Replaces special characters in a string so that it may be used as part of -- cgit v1.2.3