From 48c0906a7ebb084165c3039c9f2385b780acd1a5 Mon Sep 17 00:00:00 2001 From: Norman Clarke Date: Wed, 14 Apr 2010 11:12:07 -0300 Subject: [PATCH] Delegate Inflector.transliterate to i18n. [#4508 state:resolved] Ancillary changes: Moved Chars#normalize into a class method; removed unused UTF_PAT constant. --- .../lib/active_support/inflector/transliterate.rb | 94 ++++++++++++-------- .../lib/active_support/multibyte/chars.rb | 45 ++++++---- activesupport/test/transliterate_test.rb | 51 ++++------- 3 files changed, 101 insertions(+), 89 deletions(-) diff --git a/activesupport/lib/active_support/inflector/transliterate.rb b/activesupport/lib/active_support/inflector/transliterate.rb index 9c99dcf..5ec8737 100644 --- a/activesupport/lib/active_support/inflector/transliterate.rb +++ b/activesupport/lib/active_support/inflector/transliterate.rb @@ -3,45 +3,62 @@ require 'active_support/core_ext/string/multibyte' module ActiveSupport module Inflector - extend self - # UTF-8 byte => ASCII approximate UTF-8 byte(s) - ASCII_APPROXIMATIONS = { - 198 => [65, 69], # Æ => AE - 208 => 68, # Ð => D - 216 => 79, # Ø => O - 222 => [84, 104], # Þ => Þ - 223 => [115, 115], # ß => ss - 230 => [97, 101], # æ => ae - 240 => 100, # ð => d - 248 => 111, # ø => o - 254 => [116, 104], # þ => th - 272 => 68, # Đ => D - 273 => 100, # đ => đ - 294 => 72, # Ħ => H - 295 => 104, # ħ => h - 305 => 105, # ı => i - 306 => [73, 74], # IJ =>IJ - 307 => [105, 106], # ij => ij - 312 => 107, # ĸ => k - 319 => 76, # Ŀ => L - 320 => 108, # ŀ => l - 321 => 76, # Ł => L - 322 => 108, # ł => l - 329 => 110, # ʼn => n - 330 => [78, 71], # Ŋ => NG - 331 => [110, 103], # ŋ => ng - 338 => [79, 69], # Œ => OE - 339 => [111, 101], # œ => oe - 358 => 84, # Ŧ => T - 359 => 116 # ŧ => t - } - - # Replaces accented characters with an ASCII approximation, or deletes it if none exsits. - def transliterate(string) - ActiveSupport::Multibyte::Chars.new(string).tidy_bytes.normalize(:d).unpack("U*").map do |char| - ASCII_APPROXIMATIONS[char] || (char if char < 128) - end.compact.flatten.pack("U*") + # Replaces non-ASCII characters with an ASCII approximation, or if none + # exists, a replacement character which defaults to "?". + # + # transliterate("Ærøskøbing") + # # => "AEroskobing" + # + # Default approximations are provided for Western/Latin characters, + # e.g, "ø", "ñ", "é", "ß", etc. + # + # This method is I18n aware, so you can set up custom approximations for a + # locale. This can be useful, for example, to transliterate German's "ü" + # and "ö" to "ue" and "oe", or to add support for transliterating Russian + # to ASCII. + # + # In order to make your custom transliterations available, you must set + # them as the i18n.transliterate.rule i18n key: + # + # # Store the transliterations in locales/de.yml + # i18n: + # transliterate: + # ü: "ue" + # ö: "oe" + # + # # Or set them using Ruby + # I18n.backend.store_translations(:de, :i18n => { + # :transliterate => { + # :rule => { + # "ü" => "ue", + # "ö" => "oe" + # } + # } + # }) + # + # The value for i18n.transliterate.rule can be a simple Hash that maps + # characters to ASCII approximations as shown above, or, for more complex + # requirements, a Proc: + # + # I18n.backend.store_translations(:de, :i18n => { + # :transliterate => { + # :rule => lambda {|string| MyTransliterator.transliterate(string)} + # } + # }) + # + # Now you can have different transliterations for each locale: + # + # I18n.locale = :en + # transliterate("Jürgen") + # # => "Jurgen" + # + # I18n.locale = :de + # transliterate("Jürgen") + # # => "Juergen" + def transliterate(string, replacement = "?") + I18n.transliterate(Multibyte::Chars.normalize( + Multibyte::Chars.tidy_bytes(string), :c), :replacement => replacement) end # Replaces special characters in a string so that it may be used as part of a 'pretty' URL. @@ -73,5 +90,6 @@ module ActiveSupport end parameterized_string.downcase end + end end diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb index 4ade115..cca30d1 100644 --- a/activesupport/lib/active_support/multibyte/chars.rb +++ b/activesupport/lib/active_support/multibyte/chars.rb @@ -75,8 +75,6 @@ module ActiveSupport #:nodoc: UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/u UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/u - UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] - attr_reader :wrapped_string alias to_s wrapped_string alias to_str wrapped_string @@ -409,25 +407,11 @@ module ActiveSupport #:nodoc: # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for # passing strings to databases and validations. # - # * str - The string to perform normalization on. # * form - The form you want to normalize in. Should be one of the following: # :c, :kc, :d, or :kd. Default is # ActiveSupport::Multibyte.default_normalization_form def normalize(form=ActiveSupport::Multibyte.default_normalization_form) - # See http://www.unicode.org/reports/tr15, Table 1 - codepoints = self.class.u_unpack(@wrapped_string) - chars(case form - when :d - self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints)) - when :c - self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints))) - when :kd - self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints)) - when :kc - self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints))) - else - raise ArgumentError, "#{form} is not a valid normalization variant", caller - end.pack('U*')) + chars(self.class.normalize(@wrapped_string, form)) end # Performs canonical decomposition on all the characters. @@ -659,7 +643,7 @@ module ActiveSupport #:nodoc: # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string. # - # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP-1252 or ISO-8859-1. + # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1. def tidy_bytes(string, force = false) if force return string.unpack("C*").map do |b| @@ -708,6 +692,31 @@ module ActiveSupport #:nodoc: end bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*") end + + # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for + # passing strings to databases and validations. + # + # * string - The string to perform normalization on. + # * form - The form you want to normalize in. Should be one of the following: + # :c, :kc, :d, or :kd. Default is + # ActiveSupport::Multibyte.default_normalization_form + def normalize(string, form=ActiveSupport::Multibyte.default_normalization_form) + # See http://www.unicode.org/reports/tr15, Table 1 + codepoints = u_unpack(string) + case form + when :d + reorder_characters(decompose_codepoints(:canonical, codepoints)) + when :c + compose_codepoints(reorder_characters(decompose_codepoints(:canonical, codepoints))) + when :kd + reorder_characters(decompose_codepoints(:compatability, codepoints)) + when :kc + compose_codepoints(reorder_characters(decompose_codepoints(:compatability, codepoints))) + else + raise ArgumentError, "#{form} is not a valid normalization variant", caller + end.pack('U*') + end + end protected diff --git a/activesupport/test/transliterate_test.rb b/activesupport/test/transliterate_test.rb index d689b6b..b054855 100644 --- a/activesupport/test/transliterate_test.rb +++ b/activesupport/test/transliterate_test.rb @@ -4,36 +4,6 @@ require 'active_support/inflector/transliterate' class TransliterateTest < Test::Unit::TestCase - APPROXIMATIONS = { - "À"=>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE", - "Ç"=>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I", - "Î"=>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O", - "Õ"=>"O", "Ö"=>"O", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U", "Ü"=>"U", - "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a", "ã"=>"a", - "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e", "ê"=>"e", - "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d", "ñ"=>"n", - "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o", "ù"=>"u", - "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y", "Ā"=>"A", - "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C", "ć"=>"c", - "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c", "Ď"=>"D", - "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E", "ĕ"=>"e", - "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e", "Ĝ"=>"G", - "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G", "ģ"=>"g", - "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i", "Ī"=>"I", - "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I", "ı"=>"i", - "IJ"=>"IJ", "ij"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k", "ĸ"=>"k", - "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l", "Ŀ"=>"L", - "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N", "ņ"=>"n", - "Ň"=>"N", "ň"=>"n", "ʼn"=>"n", "Ŋ"=>"NG", "ŋ"=>"ng", "Ō"=>"O", "ō"=>"o", - "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE", "œ"=>"oe", "Ŕ"=>"R", - "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r", "Ś"=>"S", "ś"=>"s", - "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S", "š"=>"s", "Ţ"=>"T", - "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t", "Ũ"=>"U", "ũ"=>"u", - "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U", "ů"=>"u", "Ű"=>"U", - "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w", "Ŷ"=>"Y", "ŷ"=>"y", - "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z", "Ž"=>"Z", "ž"=>"z" - } - def test_transliterate_should_not_change_ascii_chars (0..127).each do |byte| char = [byte].pack("U") @@ -41,10 +11,25 @@ class TransliterateTest < Test::Unit::TestCase end end - def test_should_convert_accented_chars_to_approximate_ascii_chars - APPROXIMATIONS.each do |given, expected| - assert_equal expected, ActiveSupport::Inflector.transliterate(given) + def test_transliterate_should_approximate_ascii + # create string with range of Unicode"s western characters with + # diacritics, excluding the division and multiplication signs which for + # some reason or other are floating in the middle of all the letters. + string = (0xC0..0x17E).to_a.reject {|c| [0xD7, 0xF7].include? c}.pack("U*") + string.each_char do |char| + assert_match %r{^[a-zA-Z']*$}, ActiveSupport::Inflector.transliterate(string) end end + def test_transliterate_should_work_with_custom_i18n_rules_and_uncomposed_utf8 + char = [117, 776].pack("U*") # "ü" as ASCII "u" plus COMBINING DIAERESIS + I18n.backend.store_translations(:de, :i18n => {:transliterate => {:rule => {"ü" => "ue"}}}) + I18n.locale = :de + assert_equal "ue", ActiveSupport::Inflector.transliterate(char) + end + + def test_transliterate_should_allow_a_custom_replacement_char + assert_equal "a*b", ActiveSupport::Inflector.transliterate("a索b", "*") + end + end -- 1.7.0.3