aboutsummaryrefslogtreecommitdiffstats
path: root/activesupport/lib/active_support/inflector/transliterate.rb
blob: a6f57d73acade0d2b9d7626b9c69a3929491a04f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# frozen_string_literal: true

require "active_support/core_ext/string/multibyte"
require "active_support/i18n"

module ActiveSupport
  module Inflector
    # Replaces non-ASCII characters with an ASCII approximation, or if none
    # exists, a replacement character which defaults to "?".
    #
    #    transliterate('Ærøskøbing')
    #    # => "AEroskobing"
    #
    # Default approximations are provided for Western/Latin characters,
    # e.g., "ø", "ñ", "é", "ß", etc.
    #
    # This method is I18n aware, so you can set up custom approximations for a
    # locale. This can be useful, for example, to transliterate German's "ü"
    # and "ö" to "ue" and "oe", or to add support for transliterating Russian
    # to ASCII.
    #
    # In order to make your custom transliterations available, you must set
    # them as the <tt>i18n.transliterate.rule</tt> i18n key:
    #
    #   # Store the transliterations in locales/de.yml
    #   i18n:
    #     transliterate:
    #       rule:
    #         ü: "ue"
    #         ö: "oe"
    #
    #   # Or set them using Ruby
    #   I18n.backend.store_translations(:de, i18n: {
    #     transliterate: {
    #       rule: {
    #         'ü' => 'ue',
    #         'ö' => 'oe'
    #       }
    #     }
    #   })
    #
    # The value for <tt>i18n.transliterate.rule</tt> can be a simple Hash that
    # maps characters to ASCII approximations as shown above, or, for more
    # complex requirements, a Proc:
    #
    #   I18n.backend.store_translations(:de, i18n: {
    #     transliterate: {
    #       rule: ->(string) { MyTransliterator.transliterate(string) }
    #     }
    #   })
    #
    # Now you can have different transliterations for each locale:
    #
    #   transliterate('Jürgen', locale: :en)
    #   # => "Jurgen"
    #
    #   transliterate('Jürgen', locale: :de)
    #   # => "Juergen"
    #
    # Transliteration of ASCII-8BIT / BINARY strings is not
    # supported and will raise an ArgumentError.
    def transliterate(string, replacement = "?", locale: nil)
      raise ArgumentError, "Can only transliterate strings. Received #{string.class.name}" unless string.is_a?(String)

      allowed_encodings = [Encoding::UTF_8, Encoding::US_ASCII, Encoding::GB18030]
      raise ArgumentError, "Can not transliterate strings with #{string.encoding} encoding" unless allowed_encodings.include?(string.encoding)

      # Remember what the caller gave us so the result can be handed back in
      # the same encoding.
      input_encoding = string.encoding

      # US-ASCII is a subset of UTF-8, so re-tag the bytes as UTF-8. This way
      # we can handle invalid bytes in the same way as we do for UTF-8.
      # Work on a copy: the caller's string must not be mutated (and may be
      # frozen).
      string = string.dup.force_encoding(Encoding::UTF_8) if input_encoding == Encoding::US_ASCII

      # GB18030 covers Unicode but is not byte-compatible with UTF-8, so it has
      # to be transcoded rather than re-tagged. Invalid or undefined sequences
      # are substituted instead of raising.
      string = string.encode(Encoding::UTF_8, invalid: :replace, undef: :replace) if input_encoding == Encoding::GB18030

      transliterated = I18n.transliterate(
        ActiveSupport::Multibyte::Unicode.tidy_bytes(string).unicode_normalize(:nfc),
        replacement: replacement,
        locale: locale
      )

      # Give back the encoding we were given. Characters that cannot be
      # represented in it (e.g. a non-ASCII replacement for US-ASCII input)
      # are substituted instead of producing an invalid string.
      return transliterated if transliterated.encoding == input_encoding

      transliterated.encode(input_encoding, invalid: :replace, undef: :replace)
    end

    # Replaces special characters in a string so that it may be used as part of
    # a 'pretty' URL.
    #
    #   parameterize("Donald E. Knuth") # => "donald-e-knuth"
    #   parameterize("^très|Jolie-- ")  # => "tres-jolie"
    #
    # To use a custom separator, override the +separator+ argument.
    #
    #   parameterize("Donald E. Knuth", separator: '_') # => "donald_e_knuth"
    #   parameterize("^très|Jolie__ ", separator: '_')  # => "tres_jolie"
    #
    # To preserve the case of the characters in a string, use the +preserve_case+ argument.
    #
    #   parameterize("Donald E. Knuth", preserve_case: true) # => "Donald-E-Knuth"
    #   parameterize("^très|Jolie-- ", preserve_case: true) # => "tres-Jolie"
    #
    # It preserves dashes and underscores unless they are used as separators:
    #
    #   parameterize("^très|Jolie__ ")                 # => "tres-jolie__"
    #   parameterize("^très|Jolie-- ", separator: "_") # => "tres_jolie--"
    #   parameterize("^très_Jolie-- ", separator: ".") # => "tres_jolie--"
    #
    # If the optional parameter +locale+ is specified,
    # the word will be parameterized as a word of that language.
    # By default, this parameter is set to <tt>nil</tt> and it will use
    # the configured <tt>I18n.locale</tt>.
    def parameterize(string, separator: "-", preserve_case: false, locale: nil)
      # A nil separator means "no separator"; normalize it up front so the
      # substitutions below never receive nil as a replacement.
      separator ||= ""

      # Replace accented chars with their ASCII equivalents.
      parameterized_string = transliterate(string, locale: locale)

      # Turn unwanted chars into the separator. The block form inserts the
      # separator verbatim; passed as a replacement string, backslash
      # sequences inside it (e.g. "\\0") would be expanded as back-references.
      parameterized_string.gsub!(/[^a-z0-9\-_]+/i) { separator }

      unless separator.empty?
        if separator == "-"
          # Precompiled literals for the default separator.
          re_duplicate_separator        = /-{2,}/
          re_leading_trailing_separator = /\A-|-\z/
        else
          re_sep = Regexp.escape(separator)
          # NOTE: for a multi-character separator the {2,} quantifier binds to
          # its last character only. Kept as-is so generated slugs stay stable.
          re_duplicate_separator        = /#{re_sep}{2,}/
          re_leading_trailing_separator = /\A#{re_sep}|#{re_sep}\z/
        end
        # No more than one of the separator in a row.
        parameterized_string.gsub!(re_duplicate_separator) { separator }
        # Remove leading/trailing separator.
        parameterized_string.gsub!(re_leading_trailing_separator, "")
      end

      parameterized_string.downcase! unless preserve_case
      parameterized_string
    end
  end
end