diff options
author | Rick Olson <technoweenie@gmail.com> | 2007-11-26 03:45:54 +0000 |
---|---|---|
committer | Rick Olson <technoweenie@gmail.com> | 2007-11-26 03:45:54 +0000 |
commit | 1af084ecda66a8e1b4eb3a51a07ebca85bf2e419 (patch) | |
tree | 2eb68b7cf6357726feb52653f480c099568d5da3 /actionpack/lib/action_controller/vendor | |
parent | bd5ed651105663cb4bc5acd86ad8bdf48251d0fe (diff) | |
download | rails-1af084ecda66a8e1b4eb3a51a07ebca85bf2e419.tar.gz rails-1af084ecda66a8e1b4eb3a51a07ebca85bf2e419.tar.bz2 rails-1af084ecda66a8e1b4eb3a51a07ebca85bf2e419.zip |
Refactor sanitizer helpers into HTML classes and make it easy to swap them out with custom implementations. Closes #10129. [rick]
git-svn-id: http://svn-commit.rubyonrails.org/rails/trunk@8213 5ecf4fe2-1ee6-0310-87b1-e25e094e27de
Diffstat (limited to 'actionpack/lib/action_controller/vendor')
-rw-r--r-- | actionpack/lib/action_controller/vendor/html-scanner/html/document.rb | 1 | ||||
-rw-r--r-- | actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb | 173 |
2 files changed, 174 insertions, 0 deletions
diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb index 329ab01560..607fd186b9 100644 --- a/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb +++ b/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb @@ -1,6 +1,7 @@ require 'html/tokenizer' require 'html/node' require 'html/selector' +require 'html/sanitizer' module HTML #:nodoc: # A top-level HTMl document. You give it a body of text, and it will parse that diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb new file mode 100644 index 0000000000..377e81aead --- /dev/null +++ b/actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb @@ -0,0 +1,173 @@ +module HTML + class Sanitizer + def sanitize(text, options = {}) + return text unless sanitizeable?(text) + tokenize(text, options).join + end + + def sanitizeable?(text) + !(text.nil? || text.empty? || !text.index("<")) + end + + protected + def tokenize(text, options) + tokenizer = HTML::Tokenizer.new(text) + result = [] + while token = tokenizer.next + node = Node.parse(nil, 0, 0, token, false) + process_node node, result, options + end + result + end + + def process_node(node, result, options) + result << node.to_s + end + end + + class FullSanitizer < Sanitizer + def sanitize(text, options = {}) + result = super + # strip any comments, and if they have a newline at the end (ie. line with + # only a comment) strip that too + result.gsub!(/<!--(.*?)-->[\n]?/m, "") if result + # Recurse - handle all dirty nested tags + result == text ? result : sanitize(result, options) + end + + def process_node(node, result, options) + result << node.to_s if node.class == HTML::Text + end + end + + class LinkSanitizer < FullSanitizer + cattr_accessor :included_tags, :instance_writer => false + self.included_tags = Set.new(%w(a href)) + + def sanitizeable?(text) + !(text.nil? || text.empty? || !((text.index("<a") || text.index("<href")) && text.index(">"))) + end + + protected + def process_node(node, result, options) + result << node.to_s unless node.is_a?(HTML::Tag) && included_tags.include?(node.name) + end + end + + class WhiteListSanitizer < Sanitizer + [:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags, + :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr| + class_inheritable_accessor attr, :instance_writer => false + end + + # A regular expression of the valid characters used to separate protocols like + # the ':' in 'http://foo.com' + self.protocol_separator = /:|(�*58)|(p)|(%|%)3A/ + + # Specifies a Set of HTML attributes that can have URIs. + self.uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc)) + + # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed + # to just escaping harmless tags like <font> + self.bad_tags = Set.new(%w(script)) + + # Specifies the default Set of tags that the #sanitize helper will allow unscathed. + self.allowed_tags = Set.new(%w(strong em b i p code pre tt output samp kbd var sub + sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dt dd abbr + acronym a img blockquote del ins fieldset legend)) + + # Specifies the default Set of html attributes that the #sanitize helper will leave + # in the allowed tag. + self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr)) + + # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept. + self.allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto + feed svn urn aim rsync tag ssh sftp rtsp afs)) + + # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept. + self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse + border-color border-left-color border-right-color border-top-color clear color cursor direction display + elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height + overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation + speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space + width)) + + # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept. + self.allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center + collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal + nowrap olive pointer purple red right solid silver teal top transparent underline white yellow)) + + # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers. + self.shorthand_css_properties = Set.new(%w(background border margin padding)) + + # Sanitizes a block of css code. Used by #sanitize when it comes across a style attribute + def sanitize_css(style) + # disallow urls + style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ') + + # gauntlet + if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/ || + style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/ + return '' + end + + clean = [] + style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val| + if allowed_css_properties.include?(prop.downcase) + clean << prop + ': ' + val + ';' + elsif shorthand_css_properties.include?(prop.split('-')[0].downcase) + unless val.split().any? do |keyword| + !allowed_css_keywords.include?(keyword) && + keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/ + end + clean << prop + ': ' + val + ';' + end + end + end + clean.join(' ') + end + + protected + def tokenize(text, options) + options[:parent] = [] + options[:attributes] ||= allowed_attributes + options[:tags] ||= allowed_tags + super + end + + def process_node(node, result, options) + result << case node + when HTML::Tag + if node.closing == :close + options[:parent].shift + else + options[:parent].unshift node.name + end + + process_attributes_for node, options + + options[:tags].include?(node.name) ? node : nil + else + bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "<") + end + end + + def process_attributes_for(node, options) + return unless node.attributes + node.attributes.keys.each do |attr_name| + value = node.attributes[attr_name].to_s + + if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value) + node.attributes.delete(attr_name) + else + node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(value) + end + end + end + + def contains_bad_protocols?(attr_name, value) + uri_attributes.include?(attr_name) && + (value =~ /(^[^\/:]*):|(�*58)|(p)|(%|%)3A/ && !allowed_protocols.include?(value.split(protocol_separator).first)) + end + end +end
\ No newline at end of file |