aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Norman Clarke <norman@njclarke.com> 2010-04-12 12:44:25 -0300
committer: Jeremy Kemper <jeremy@bitsweat.net> 2010-04-12 23:19:39 -0700
commit: dceef0828a23e8298dd9a9aab1a33c49e84f17d6 (patch)
tree: c3fcba59013a1f543df7cfffda35c0ef4688b010
parent: 36f3634a6afbaf36015abb531d6bea6360654b81 (diff)
download: rails-dceef0828a23e8298dd9a9aab1a33c49e84f17d6.tar.gz
rails-dceef0828a23e8298dd9a9aab1a33c49e84f17d6.tar.bz2
rails-dceef0828a23e8298dd9a9aab1a33c49e84f17d6.zip
Improve reliability of Inflector.transliterate. [#4374 state:resolved]
Signed-off-by: Jeremy Kemper <jeremy@bitsweat.net>
-rw-r--r-- activesupport/CHANGELOG 2
-rw-r--r-- activesupport/lib/active_support/inflector/transliterate.rb 61
-rw-r--r-- activesupport/test/inflector_test_cases.rb 5
-rw-r--r-- activesupport/test/transliterate_test.rb 50
4 files changed, 93 insertions, 25 deletions
diff --git a/activesupport/CHANGELOG b/activesupport/CHANGELOG
index a5a7a9b904..749e59e091 100644
--- a/activesupport/CHANGELOG
+++ b/activesupport/CHANGELOG
@@ -1,5 +1,7 @@
*Rails 3.0.0 [beta 3] (pending)*
+* Improve transliteration quality. #4374 [Norman Clarke]
+
* Speed up and add Ruby 1.9 support for ActiveSupport::Multibyte::Chars#tidy_bytes. #4350 [Norman Clarke]
diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb
index ca591abc7d..9c99dcfb01 100644
--- a/activesupport/lib/active_support/inflector/transliterate.rb
+++ b/activesupport/lib/active_support/inflector/transliterate.rb
@@ -1,32 +1,47 @@
# encoding: utf-8
-require 'iconv'
-require 'kconv'
require 'active_support/core_ext/string/multibyte'
module ActiveSupport
module Inflector
extend self
-
- # Replaces accented characters with their ascii equivalents.
- def transliterate(string)
- Iconv.iconv('ascii//ignore//translit', 'utf-8', string).to_s
- end
- if RUBY_VERSION >= '1.9'
- undef_method :transliterate
- def transliterate(string)
- proxy = ActiveSupport::Multibyte.proxy_class.new(string)
- proxy.normalize(:kd).gsub(/[^\x00-\x7F]+/, '')
- end
+ # Unicode code point => ASCII approximation code point(s)
+ ASCII_APPROXIMATIONS = {
+ 198 => [65, 69], # Æ => AE
+ 208 => 68, # Ð => D
+ 216 => 79, # Ø => O
+ 222 => [84, 104], # Þ => Th
+ 223 => [115, 115], # ß => ss
+ 230 => [97, 101], # æ => ae
+ 240 => 100, # ð => d
+ 248 => 111, # ø => o
+ 254 => [116, 104], # þ => th
+ 272 => 68, # Đ => D
+ 273 => 100, # đ => d
+ 294 => 72, # Ħ => H
+ 295 => 104, # ħ => h
+ 305 => 105, # ı => i
+ 306 => [73, 74], # Ĳ => IJ
+ 307 => [105, 106], # ĳ => ij
+ 312 => 107, # ĸ => k
+ 319 => 76, # Ŀ => L
+ 320 => 108, # ŀ => l
+ 321 => 76, # Ł => L
+ 322 => 108, # ł => l
+ 329 => 110, # ŉ => n
+ 330 => [78, 71], # Ŋ => NG
+ 331 => [110, 103], # ŋ => ng
+ 338 => [79, 69], # Œ => OE
+ 339 => [111, 101], # œ => oe
+ 358 => 84, # Ŧ => T
+ 359 => 116 # ŧ => t
+ }
- # The iconv transliteration code doesn't function correctly
- # on some platforms, but it's very fast where it does function.
- elsif "foo" != (Inflector.transliterate("föö") rescue nil)
- undef_method :transliterate
- def transliterate(string)
- string.mb_chars.normalize(:kd). # Decompose accented characters
- gsub(/[^\x00-\x7F]+/, '') # Remove anything non-ASCII entirely (e.g. diacritics).
- end
+ # Replaces accented characters with an ASCII approximation, or deletes them if none exists.
+ def transliterate(string)
+ ActiveSupport::Multibyte::Chars.new(string).tidy_bytes.normalize(:d).unpack("U*").map do |char|
+ ASCII_APPROXIMATIONS[char] || (char if char < 128)
+ end.compact.flatten.pack("U*")
end
# Replaces special characters in a string so that it may be used as part of a 'pretty' URL.
@@ -45,8 +60,6 @@ module ActiveSupport
# <%= link_to(@person.name, person_path(@person)) %>
# # => <a href="/person/1-donald-e-knuth">Donald E. Knuth</a>
def parameterize(string, sep = '-')
- # remove malformed utf8 characters
- string = string.toutf8 unless string.is_utf8?
# replace accented chars with their ascii equivalents
parameterized_string = transliterate(string)
# Turn unwanted chars into the separator
@@ -59,6 +72,6 @@ module ActiveSupport
parameterized_string.gsub!(/^#{re_sep}|#{re_sep}$/i, '')
end
parameterized_string.downcase
- end
+ end
end
end
diff --git a/activesupport/test/inflector_test_cases.rb b/activesupport/test/inflector_test_cases.rb
index 29f87ac302..59515dad32 100644
--- a/activesupport/test/inflector_test_cases.rb
+++ b/activesupport/test/inflector_test_cases.rb
@@ -188,7 +188,10 @@ module InflectorTestCases
StringToParameterizedAndNormalized = {
"Malmö" => "malmo",
"Garçons" => "garcons",
- "Ops\331" => "ops"
+ "Ops\331" => "opsu",
+ "Ærøskøbing" => "aeroskobing",
+ "Aßlar" => "asslar",
+ "Japanese: 日本語" => "japanese"
}
UnderscoreToHuman = {
diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb
new file mode 100644
index 0000000000..d689b6be73
--- /dev/null
+++ b/activesupport/test/transliterate_test.rb
@@ -0,0 +1,50 @@
+# encoding: utf-8
+require 'abstract_unit'
+require 'active_support/inflector/transliterate'
+
+class TransliterateTest < Test::Unit::TestCase
+
+ APPROXIMATIONS = {
+ "À"=>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE",
+ "Ç"=>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I",
+ "Î"=>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O",
+ "Õ"=>"O", "Ö"=>"O", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U", "Ü"=>"U",
+ "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a", "ã"=>"a",
+ "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e", "ê"=>"e",
+ "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d", "ñ"=>"n",
+ "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o", "ù"=>"u",
+ "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y", "Ā"=>"A",
+ "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C", "ć"=>"c",
+ "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c", "Ď"=>"D",
+ "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E", "ĕ"=>"e",
+ "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e", "Ĝ"=>"G",
+ "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G", "ģ"=>"g",
+ "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i", "Ī"=>"I",
+ "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I", "ı"=>"i",
+ "IJ"=>"IJ", "ij"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k", "ĸ"=>"k",
+ "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l", "Ŀ"=>"L",
+ "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N", "ņ"=>"n",
+ "Ň"=>"N", "ň"=>"n", "ʼn"=>"n", "Ŋ"=>"NG", "ŋ"=>"ng", "Ō"=>"O", "ō"=>"o",
+ "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE", "œ"=>"oe", "Ŕ"=>"R",
+ "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r", "Ś"=>"S", "ś"=>"s",
+ "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S", "š"=>"s", "Ţ"=>"T",
+ "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t", "Ũ"=>"U", "ũ"=>"u",
+ "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U", "ů"=>"u", "Ű"=>"U",
+ "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w", "Ŷ"=>"Y", "ŷ"=>"y",
+ "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z", "Ž"=>"Z", "ž"=>"z"
+ }
+
+ def test_transliterate_should_not_change_ascii_chars
+ (0..127).each do |byte|
+ char = [byte].pack("U")
+ assert_equal char, ActiveSupport::Inflector.transliterate(char)
+ end
+ end
+
+ def test_should_convert_accented_chars_to_approximate_ascii_chars
+ APPROXIMATIONS.each do |given, expected|
+ assert_equal expected, ActiveSupport::Inflector.transliterate(given)
+ end
+ end
+
+end