From 911f3db00abb7b35b400973c032e4e5c340bce6f Mon Sep 17 00:00:00 2001 From: Michael Koziarski Date: Tue, 17 Oct 2006 08:29:16 +0000 Subject: Ensure Chars#tidy_bytes only tidies broken bytes. Closes #6397 [Manfred Stienstra] git-svn-id: http://svn-commit.rubyonrails.org/rails/trunk@5316 5ecf4fe2-1ee6-0310-87b1-e25e094e27de --- .../active_support/multibyte/handlers/utf8_handler.rb | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'activesupport/lib/active_support/multibyte/handlers') diff --git a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb index 6c8eb88702..5b64734297 100644 --- a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb +++ b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb @@ -259,13 +259,18 @@ module ActiveSupport::Multibyte::Handlers g_unpack(str).length end - # Strips all the non-utf-8 bytes from the string resulting in a valid utf-8 string + # Replaces all the non-utf-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid utf-8 string def tidy_bytes(str) - str.unpack('C*').map { |n| - n < 128 ? n.chr : - n < 160 ? [UCD.cp1252[n] || n].pack('U') : - n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr - }.join + str.split(//u).map do |c| + if !UTF8_PAT.match(c) + n = c.unpack('C')[0] + n < 128 ? n.chr : + n < 160 ? [UCD.cp1252[n] || n].pack('U') : + n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr + else + c + end + end.join end protected -- cgit v1.2.3