3 files changed, 227 insertions, 155 deletions
diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb
index 329ab01560..607fd186b9 100644
--- a/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb
+++ b/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb
@@ -1,6 +1,7 @@
 require 'html/tokenizer'
 require 'html/node'
 require 'html/selector'
+require 'html/sanitizer'
 
 module HTML #:nodoc:
   # A top-level HTMl document. You give it a body of text, and it will parse that
diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb
new file mode 100644
index 0000000000..377e81aead
--- /dev/null
+++ b/actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb
@@ -0,0 +1,173 @@
+module HTML
+  class Sanitizer # Base class: tokenizes markup and lets #process_node decide what reaches the output.
+    def sanitize(text, options = {}) # returns +text+ itself when #sanitizeable? says there is nothing to do
+      return text unless sanitizeable?(text)
+      tokenize(text, options).join
+    end
+    # False for nil, for an empty string, and for text that contains no "<".
+    def sanitizeable?(text)
+      !(text.nil? || text.empty? || !text.index("<"))
+    end
+    
+  protected
+    def tokenize(text, options) # returns the array of output pieces that #sanitize joins
+      tokenizer = HTML::Tokenizer.new(text)
+      result = []
+      while token = tokenizer.next
+        node = Node.parse(nil, 0, 0, token, false)
+        process_node node, result, options
+      end
+      result
+    end
+    # Hook for subclasses: this base version appends every node's string form unchanged.
+    def process_node(node, result, options)
+      result << node.to_s
+    end
+  end
+  
+  class FullSanitizer < Sanitizer # keeps only the text nodes and strips HTML comments
+    def sanitize(text, options = {})
+      result = super
+      # Strip comments (plus one trailing newline, ie. a comment-only line) without gsub!:
+      # super may hand back +text+ itself, which must not be mutated and may be frozen.
+      result = result.gsub(/<!--(.*?)-->[\n]?/m, "") if result
+      # Recurse until the output stops changing - handle all dirty nested tags
+      result == text ? result : sanitize(result, options)
+    end
+    # Append only HTML::Text nodes; every other node is discarded.
+    def process_node(node, result, options)
+      result << node.to_s if node.class == HTML::Text
+    end
+  end
+  
+  class LinkSanitizer < FullSanitizer # removes the tags named in included_tags and keeps everything else
+    cattr_accessor :included_tags, :instance_writer => false
+    self.included_tags = Set.new(%w(a href))
+    # Only text containing "<a" or "<href", and also a ">", is worth tokenizing.
+    def sanitizeable?(text)
+      !(text.nil? || text.empty? || !((text.index("<a") || text.index("<href")) && text.index(">")))
+    end
+    # NOTE: #sanitize is inherited from FullSanitizer, so HTML comments are stripped as well.
+  protected
+    def process_node(node, result, options) # every node is kept except HTML::Tag nodes named in included_tags
+      result << node.to_s unless node.is_a?(HTML::Tag) && included_tags.include?(node.name) 
+    end
+  end
+  
+  class WhiteListSanitizer < Sanitizer
+    [:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
+     :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr|
+      class_inheritable_accessor attr, :instance_writer => false
+    end
+
+    # A regular expression of the valid separators after a protocol, like the ':' in 'http://foo.com',
+    # including its entity (&#58, &#x3a) and percent-encoded (%3A) spellings in either case; (&#x70) is legacy.
+    self.protocol_separator     = /:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i
+    
+    # Specifies a Set of HTML attributes that can have URIs.
+    self.uri_attributes         = Set.new(%w(href src cite action longdesc xlink:href lowsrc))
+
+    # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
+    # to just escaping harmless tags like &lt;font&gt;
+    self.bad_tags               = Set.new(%w(script))
+    
+    # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
+    self.allowed_tags           = Set.new(%w(strong em b i p code pre tt output samp kbd var sub 
+      sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dt dd abbr 
+      acronym a img blockquote del ins fieldset legend))
+
+    # Specifies the default Set of html attributes that the #sanitize helper will leave 
+    # in the allowed tag.
+    self.allowed_attributes     = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
+    
+    # Specifies the default Set of protocols that the #sanitize helper will leave in protocol attributes.
+    self.allowed_protocols      = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto 
+      feed svn urn aim rsync tag ssh sftp rtsp afs))
+    
+    # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
+    self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse 
+      border-color border-left-color border-right-color border-top-color clear color cursor direction display 
+      elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
+      overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
+      speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
+      width))
+  
+    # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
+    self.allowed_css_keywords   = Set.new(%w(auto aqua black block blue bold both bottom brown center
+      collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
+      nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))
+
+    # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
+    self.shorthand_css_properties = Set.new(%w(background border margin padding))
+
+    # Sanitizes a block of css code (used for style attributes): '' if it fails the gauntlet, else only the whitelisted declarations.
+    def sanitize_css(style)
+      # disallow urls
+      style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
+
+      # gauntlet - anchored with \A/\z because ^/$ match per line and would let a payload ride along with one clean line
+      if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/ ||
+          style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$)\s*)*\z/
+        return ''
+      end
+
+      clean = []
+      style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
+        if allowed_css_properties.include?(prop.downcase)
+          clean <<  prop + ': ' + val + ';'
+        elsif shorthand_css_properties.include?(prop.split('-')[0].downcase) 
+          unless val.split().any? do |keyword| # reject when any token is neither an allowed keyword nor a colour/length
+            !allowed_css_keywords.include?(keyword) && 
+              keyword !~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
+          end
+            clean << prop + ': ' + val + ';'
+          end
+        end
+      end
+      clean.join(' ')
+    end
+
+  protected
+    def tokenize(text, options) # fills in defaults on a copy so the caller's options hash is never modified
+      options = options.merge(:parent => [])
+      options[:attributes] ||= allowed_attributes
+      options[:tags]       ||= allowed_tags
+      super
+    end
+
+    def process_node(node, result, options) # options[:parent] is a stack of open tag names, innermost first
+      result << case node
+        when HTML::Tag
+          if node.closing == :close
+            options[:parent].shift
+          else
+            options[:parent].unshift node.name # NOTE(review): every non-:close tag is pushed; confirm how self-closing tags report #closing
+          end
+          # Attributes are cleaned in place before the whitelist decides whether the tag survives.
+          process_attributes_for node, options
+          # A tag outside options[:tags] contributes nil instead of the node.
+          options[:tags].include?(node.name) ? node : nil
+        else
+          bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;") # nil inside a bad tag, otherwise escape "<"
+      end
+    end
+    
+    def process_attributes_for(node, options) # edits node.attributes in place; does nothing when the node has none
+      return unless node.attributes
+      node.attributes.keys.each do |attr_name|
+        value = node.attributes[attr_name].to_s
+        # Delete attributes that are not whitelisted or carry a disallowed protocol; clean the rest.
+        if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
+          node.attributes.delete(attr_name)
+        else
+          node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(value)
+        end
+      end
+    end
+
+    def contains_bad_protocols?(attr_name, value) # truthy when a URI attribute names a protocol outside allowed_protocols
+      uri_attributes.include?(attr_name) && # the ':' may also be spelled &#58, &#x3a or %3A in any case; (&#x70) is kept only for compatibility
+      (value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i && !allowed_protocols.include?(value.split(protocol_separator).first))
+    end
+  end
+end
+\ No newline at end of file
diff --git a/actionpack/lib/action_view/helpers/sanitize_helper.rb b/actionpack/lib/action_view/helpers/sanitize_helper.rb
index e67abd9f67..47fbe3a27a 100644
--- a/actionpack/lib/action_view/helpers/sanitize_helper.rb
+++ b/actionpack/lib/action_view/helpers/sanitize_helper.rb
@@ -49,69 +49,12 @@ module ActionView
       #   end
       # 
       def sanitize(html, options = {})
-        return html if html.blank? || !html.include?('<')
-
-        attrs = options[:attributes] || sanitized_allowed_attributes
-        tags  = options[:tags]       || sanitized_allowed_tags
-
-        returning [] do |new_text|
-          tokenizer = HTML::Tokenizer.new(html)
-          parent    = [] 
-
-          while token = tokenizer.next
-            node = HTML::Node.parse(nil, 0, 0, token, false)
-
-            new_text << case node
-              when HTML::Tag
-                if node.closing == :close
-                  parent.shift
-                else
-                  parent.unshift node.name
-                end
-
-                node.attributes.keys.each do |attr_name|
-                  value = node.attributes[attr_name].to_s
-
-                  if !attrs.include?(attr_name) || contains_bad_protocols?(attr_name, value)
-                    node.attributes.delete(attr_name)
-                  else
-                    node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(value)
-                  end
-                end if node.attributes
-
-                tags.include?(node.name) ? node : nil
-              else
-                sanitized_bad_tags.include?(parent.first) ? nil : node.to_s.gsub(/</, "&lt;")
-            end
-          end
-        end.join
+        self.class.white_list_sanitizer.sanitize(html, options)
       end
 
       # Sanitizes a block of css code.  Used by #sanitize when it comes across a style attribute
       def sanitize_css(style)
-        # disallow urls
-        style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
-
-        # gauntlet
-        if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/ ||
-            style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
-          return ''
-        end
-
-        returning [] do |clean|
-          style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
-            if sanitized_allowed_css_properties.include?(prop.downcase)
-              clean <<  prop + ': ' + val + ';'
-            elsif sanitized_shorthand_css_properties.include?(prop.split('-')[0].downcase) 
-              unless val.split().any? do |keyword|
-                !sanitized_allowed_css_keywords.include?(keyword) && 
-                  keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
-              end
-                clean << prop + ': ' + val + ';'
-              end
-            end
-          end
-        end.join(' ')
+        self.class.white_list_sanitizer.sanitize_css(style)
       end
 
       # Strips all HTML tags from the +html+, including comments.  This uses the 
@@ -129,23 +72,7 @@ module ActionView
       #   strip_tags("<div id='top-bar'>Welcome to my website!</div>")
       #   # => Welcome to my website!
       def strip_tags(html)     
-        return html if html.blank? || !html.index("<")
-        tokenizer = HTML::Tokenizer.new(html)
-
-        text = returning [] do |text|
-          while token = tokenizer.next
-            node = HTML::Node.parse(nil, 0, 0, token, false)
-            # result is only the content of any Text nodes
-            text << node.to_s if node.class == HTML::Text  
-          end
-        end
-        
-        # strip any comments, and if they have a newline at the end (ie. line with
-        # only a comment) strip that too
-        result = text.join.gsub(/<!--(.*?)-->[\n]?/m, "")
-        
-        # Recurse - handle all dirty nested tags
-        result == html ? result : strip_tags(result)
+        self.class.full_sanitizer.sanitize(html)
       end
 
       # Strips all link tags from +text+ leaving just the link text.
@@ -160,80 +87,57 @@ module ActionView
       #   strip_links('Blog: <a href="http://www.myblog.com/" class="nav" target=\"_blank\">Visit</a>.')
       #   # => Blog: Visit
       def strip_links(html)
-        if !html.blank? && (html.index("<a") || html.index("<href")) && html.index(">")
-          tokenizer = HTML::Tokenizer.new(html)
-          result = returning [] do |result|
-            while token = tokenizer.next 
-              node = HTML::Node.parse(nil, 0, 0, token, false) 
-              result << node.to_s unless node.is_a?(HTML::Tag) && ["a", "href"].include?(node.name) 
-            end 
-          end.join
-          result == html ? result : strip_links(result) # Recurse - handle all dirty nested links
-        else
-          html
-        end
+        self.class.link_sanitizer.sanitize(html)
       end
 
-      # A regular expression of the valid characters used to separate protocols like
-      # the ':' in 'http://foo.com'
-      @@sanitized_protocol_separator = /:|(&#0*58)|(&#x70)|(%|&#37;)3A/
-      mattr_accessor :sanitized_protocol_separator, :instance_writer => false
-
-      # Specifies a Set of HTML attributes that can have URIs.
-      @@sanitized_uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))
-      mattr_reader :sanitized_uri_attributes
-
-      # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
-      # to just escaping harmless tags like &lt;font&gt;
-      @@sanitized_bad_tags = Set.new(%w(script))
-      mattr_reader :sanitized_bad_tags
-
-      # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
-      @@sanitized_allowed_tags = Set.new(%w(strong em b i p code pre tt output samp kbd var sub 
-        sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dt dd abbr 
-        acronym a img blockquote del ins fieldset legend))
-      mattr_reader :sanitized_allowed_tags
-
-      # Specifies the default Set of html attributes that the #sanitize helper will leave 
-      # in the allowed tag.
-      @@sanitized_allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
-      mattr_reader :sanitized_allowed_attributes
-
-      # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
-      @@sanitized_allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse 
-        border-color border-left-color border-right-color border-top-color clear color cursor direction display 
-        elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
-        overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
-        speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
-        width))
-      mattr_reader :sanitized_allowed_css_properties
-
-      # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
-      @@sanitized_allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
-        collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
-        nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))
-      mattr_reader :sanitized_allowed_css_keywords
-
-      # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
-      @@sanitized_shorthand_css_properties = Set.new(%w(background border margin padding))
-      mattr_reader :sanitized_shorthand_css_properties
-
-      # Specifies the default Set of protocols that the #sanitize helper will leave in
-      # protocol attributes.
-      @@sanitized_allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto feed svn urn aim rsync tag ssh sftp rtsp afs))
-      mattr_reader :sanitized_allowed_protocols
-
       module ClassMethods #:nodoc:
         def self.extended(base)
           class << base
+            attr_writer :full_sanitizer, :link_sanitizer, :white_list_sanitizer
+            # NOTE(review): the generated sanitized_protocol_separator= calls white_list_sanitizer.protocol_separator=, but HTML::WhiteListSanitizer declares its accessors with :instance_writer => false - confirm the instance accepts it.
             # we want these to be class methods on ActionView::Base, they'll get mattr_readers for these below.
-            [:sanitized_protocol_separator, :sanitized_uri_attributes, :sanitized_bad_tags, :sanitized_allowed_tags,
+            helper_def = [:sanitized_protocol_separator, :sanitized_uri_attributes, :sanitized_bad_tags, :sanitized_allowed_tags,
                 :sanitized_allowed_attributes, :sanitized_allowed_css_properties, :sanitized_allowed_css_keywords,
-                :sanitized_shorthand_css_properties, :sanitized_allowed_protocols, :sanitized_protocol_separator=].each do |prop|
-              delegate prop, :to => SanitizeHelper
-            end
+                :sanitized_shorthand_css_properties, :sanitized_allowed_protocols, :sanitized_protocol_separator=].collect! do |prop|
+              prop = prop.to_s
+              "def #{prop}(#{:value if prop =~ /=$/}) white_list_sanitizer.#{prop.sub /sanitized_/, ''} #{:value if prop =~ /=$/} end"
+            end.join("\n")
+            eval helper_def # defines one forwarding method per name on base, e.g. sanitized_allowed_tags -> white_list_sanitizer.allowed_tags
           end
         end
+        
+        # Gets the sanitizer used by strip_tags, creating an HTML::FullSanitizer the first
+        # time it is needed.  Replace with any object that responds to #sanitize
+        #
+        #   Rails::Initializer.run do |config|
+        #     config.action_view.full_sanitizer = MySpecialSanitizer.new
+        #   end
+        #
+        def full_sanitizer
+          @full_sanitizer ||= HTML::FullSanitizer.new
+        end
+
+        # Gets the sanitizer used by strip_links, creating an HTML::LinkSanitizer the first
+        # time it is needed.  Replace with any object that responds to #sanitize
+        #
+        #   Rails::Initializer.run do |config|
+        #     config.action_view.link_sanitizer = MySpecialSanitizer.new
+        #   end
+        #
+        def link_sanitizer
+          @link_sanitizer ||= HTML::LinkSanitizer.new
+        end
+
+        # Gets the HTML::WhiteListSanitizer instance used by sanitize and sanitize_css.
+        # Replace with any object that responds to #sanitize and #sanitize_css
+        #
+        #   Rails::Initializer.run do |config|
+        #     config.action_view.white_list_sanitizer = MySpecialSanitizer.new
+        #   end
+        #
+        def white_list_sanitizer
+          @white_list_sanitizer ||= HTML::WhiteListSanitizer.new
+        end
 
         # Adds valid HTML attributes that the #sanitize helper checks for URIs.
         #
@@ -242,7 +146,7 @@ module ActionView
         #   end
         #
         def sanitized_uri_attributes=(attributes)
-          Helpers::SanitizeHelper.sanitized_uri_attributes.merge(attributes)
+          HTML::WhiteListSanitizer.uri_attributes.merge(attributes)
         end
 
         # Adds to the Set of 'bad' tags for the #sanitize helper.
@@ -252,7 +156,7 @@ module ActionView
         #   end
         #
         def sanitized_bad_tags=(attributes)
-          Helpers::SanitizeHelper.sanitized_bad_tags.merge(attributes)
+          HTML::WhiteListSanitizer.bad_tags.merge(attributes)
         end
         # Adds to the Set of allowed tags for the #sanitize helper.
         #
@@ -261,7 +165,7 @@ module ActionView
         #   end
         #
         def sanitized_allowed_tags=(attributes)
-          Helpers::SanitizeHelper.sanitized_allowed_tags.merge(attributes)
+          HTML::WhiteListSanitizer.allowed_tags.merge(attributes)
         end
 
         # Adds to the Set of allowed html attributes for the #sanitize helper.
@@ -271,7 +175,7 @@ module ActionView
         #   end
         #
         def sanitized_allowed_attributes=(attributes)
-          Helpers::SanitizeHelper.sanitized_allowed_attributes.merge(attributes)
+          HTML::WhiteListSanitizer.allowed_attributes.merge(attributes)
         end
 
         # Adds to the Set of allowed css properties for the #sanitize and #sanitize_css heleprs.
@@ -281,7 +185,7 @@ module ActionView
         #   end
         #
         def sanitized_allowed_css_properties=(attributes)
-          Helpers::SanitizeHelper.sanitized_allowed_css_properties.merge(attributes)
+          HTML::WhiteListSanitizer.allowed_css_properties.merge(attributes)
         end
 
         # Adds to the Set of allowed css keywords for the #sanitize and #sanitize_css helpers.
@@ -291,7 +195,7 @@ module ActionView
         #   end
         #
         def sanitized_allowed_css_keywords=(attributes)
-          Helpers::SanitizeHelper.sanitized_allowed_css_keywords.merge(attributes)
+          HTML::WhiteListSanitizer.allowed_css_keywords.merge(attributes)
         end
 
         # Adds to the Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
@@ -301,7 +205,7 @@ module ActionView
         #   end
         #
         def sanitized_shorthand_css_properties=(attributes)
-          Helpers::SanitizeHelper.sanitized_shorthand_css_properties.merge(attributes)
+          HTML::WhiteListSanitizer.shorthand_css_properties.merge(attributes)
         end
 
         # Adds to the Set of allowed protocols for the #sanitize helper.
@@ -311,15 +215,9 @@ module ActionView
         #   end
         #
         def sanitized_allowed_protocols=(attributes)
-          Helpers::SanitizeHelper.sanitized_allowed_protocols.merge(attributes)
+          HTML::WhiteListSanitizer.allowed_protocols.merge(attributes)
         end
       end
-
-      private
-        def contains_bad_protocols?(attr_name, value)
-          sanitized_uri_attributes.include?(attr_name) && 
-          (value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(%|&#37;)3A/ && !sanitized_allowed_protocols.include?(value.split(sanitized_protocol_separator).first))
-        end
     end
   end
 end