From ba83aa7f03d2742dd242a45229e0f67785871515 Mon Sep 17 00:00:00 2001 From: Piotr Sarnacki Date: Sat, 9 Jun 2012 14:40:50 +0200 Subject: Move action_controller/vendor/html-scanner to action_view This is another step in moving Action View's dependencies in Action Pack to Action View itself. Also, HtmlScanner seems to be better suited for views rather than controllers. --- .../vendor/html-scanner/html/document.rb | 68 -- .../vendor/html-scanner/html/node.rb | 532 ------------- .../vendor/html-scanner/html/sanitizer.rb | 188 ----- .../vendor/html-scanner/html/selector.rb | 830 --------------------- .../vendor/html-scanner/html/tokenizer.rb | 107 --- .../vendor/html-scanner/html/version.rb | 11 - 6 files changed, 1736 deletions(-) delete mode 100644 actionpack/lib/action_controller/vendor/html-scanner/html/document.rb delete mode 100644 actionpack/lib/action_controller/vendor/html-scanner/html/node.rb delete mode 100644 actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb delete mode 100644 actionpack/lib/action_controller/vendor/html-scanner/html/selector.rb delete mode 100644 actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb delete mode 100644 actionpack/lib/action_controller/vendor/html-scanner/html/version.rb (limited to 'actionpack/lib/action_controller/vendor/html-scanner') diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb deleted file mode 100644 index 386820300a..0000000000 --- a/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb +++ /dev/null @@ -1,68 +0,0 @@ -require 'html/tokenizer' -require 'html/node' -require 'html/selector' -require 'html/sanitizer' - -module HTML #:nodoc: - # A top-level HTML document. You give it a body of text, and it will parse that - # text into a tree of nodes. - class Document #:nodoc: - - # The root of the parsed document. - attr_reader :root - - # Create a new Document from the given text. - def initialize(text, strict=false, xml=false) - tokenizer = Tokenizer.new(text) - @root = Node.new(nil) - node_stack = [ @root ] - while token = tokenizer.next - node = Node.parse(node_stack.last, tokenizer.line, tokenizer.position, token, strict) - - node_stack.last.children << node unless node.tag? && node.closing == :close - if node.tag? - if node_stack.length > 1 && node.closing == :close - if node_stack.last.name == node.name - if node_stack.last.children.empty? - node_stack.last.children << Text.new(node_stack.last, node.line, node.position, "") - end - node_stack.pop - else - open_start = node_stack.last.position - 20 - open_start = 0 if open_start < 0 - close_start = node.position - 20 - close_start = 0 if close_start < 0 - msg = < hash } unless Hash === hash - hash = keys_to_symbols(hash) - hash.each do |k,v| - case k - when :tag, :content then - # keys are valid, and require no further processing - when :attributes then - hash[k] = keys_to_strings(v) - when :parent, :child, :ancestor, :descendant, :sibling, :before, - :after - hash[k] = Conditions.new(v) - when :children - hash[k] = v = keys_to_symbols(v) - v.each do |key,value| - case key - when :count, :greater_than, :less_than - # keys are valid, and require no further processing - when :only - v[key] = Conditions.new(value) - else - raise "illegal key #{key.inspect} => #{value.inspect}" - end - end - else - raise "illegal key #{k.inspect} => #{v.inspect}" - end - end - update hash - end - - private - - def keys_to_strings(hash) - Hash[hash.keys.map {|k| [k.to_s, hash[k]]}] - end - - def keys_to_symbols(hash) - Hash[hash.keys.map do |k| - raise "illegal key #{k.inspect}" unless k.respond_to?(:to_sym) - [k.to_sym, hash[k]] - end] - end - end - - # The base class of all nodes, textual and otherwise, in an HTML document. - class Node #:nodoc: - # The array of children of this node. Not all nodes have children. - attr_reader :children - - # The parent node of this node. All nodes have a parent, except for the - # root node. - attr_reader :parent - - # The line number of the input where this node was begun - attr_reader :line - - # The byte position in the input where this node was begun - attr_reader :position - - # Create a new node as a child of the given parent. - def initialize(parent, line=0, pos=0) - @parent = parent - @children = [] - @line, @position = line, pos - end - - # Return a textual representation of the node. - def to_s - @children.join() - end - - # Return false (subclasses must override this to provide specific matching - # behavior.) +conditions+ may be of any type. - def match(conditions) - false - end - - # Search the children of this node for the first node for which #find - # returns non +nil+. Returns the result of the #find call that succeeded. - def find(conditions) - conditions = validate_conditions(conditions) - @children.each do |child| - node = child.find(conditions) - return node if node - end - nil - end - - # Search for all nodes that match the given conditions, and return them - # as an array. - def find_all(conditions) - conditions = validate_conditions(conditions) - - matches = [] - matches << self if match(conditions) - @children.each do |child| - matches.concat child.find_all(conditions) - end - matches - end - - # Returns +false+. Subclasses may override this if they define a kind of - # tag. - def tag? - false - end - - def validate_conditions(conditions) - Conditions === conditions ? conditions : Conditions.new(conditions) - end - - def ==(node) - return false unless self.class == node.class && children.size == node.children.size - - equivalent = true - - children.size.times do |i| - equivalent &&= children[i] == node.children[i] - end - - equivalent - end - - class </) - if strict - raise "expected ]]> (got #{scanner.rest.inspect} for #{content})" - else - scanner.skip_until(/\Z/) - end - end - - return CDATA.new(parent, line, pos, scanner.pre_match.gsub(/\/]+/) - name.downcase! - - unless closing - scanner.skip(/\s*/) - attributes = {} - while attr = scanner.scan(/[-\w:]+/) - value = true - if scanner.scan(/\s*=\s*/) - if delim = scanner.scan(/['"]/) - value = "" - while text = scanner.scan(/[^#{delim}\\]+|./) - case text - when "\\" then - value << text - break if scanner.eos? - value << scanner.getch - when delim - break - else value << text - end - end - else - value = scanner.scan(/[^\s>\/]+/) - end - end - attributes[attr.downcase] = value - scanner.skip(/\s*/) - end - - closing = ( scanner.scan(/\//) ? :self : nil ) - end - - unless scanner.scan(/\s*>/) - if strict - raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})" - else - # throw away all text until we find what we're looking for - scanner.skip_until(/>/) or scanner.terminate - end - end - - Tag.new(parent, line, pos, name, attributes, closing) - end - end - end - end - - # A node that represents text, rather than markup. - class Text < Node #:nodoc: - - attr_reader :content - - # Creates a new text node as a child of the given parent, with the given - # content. - def initialize(parent, line, pos, content) - super(parent, line, pos) - @content = content - end - - # Returns the content of this node. - def to_s - @content - end - - # Returns +self+ if this node meets the given conditions. Text nodes support - # conditions of the following kinds: - # - # * if +conditions+ is a string, it must be a substring of the node's - # content - # * if +conditions+ is a regular expression, it must match the node's - # content - # * if +conditions+ is a hash, it must contain a :content key that - # is either a string or a regexp, and which is interpreted as described - # above. - def find(conditions) - match(conditions) && self - end - - # Returns non-+nil+ if this node meets the given conditions, or +nil+ - # otherwise. See the discussion of #find for the valid conditions. - def match(conditions) - case conditions - when String - @content == conditions - when Regexp - @content =~ conditions - when Hash - conditions = validate_conditions(conditions) - - # Text nodes only have :content, :parent, :ancestor - unless (conditions.keys - [:content, :parent, :ancestor]).empty? - return false - end - - match(conditions[:content]) - else - nil - end - end - - def ==(node) - return false unless super - content == node.content - end - end - - # A CDATA node is simply a text node with a specialized way of displaying - # itself. - class CDATA < Text #:nodoc: - def to_s - "" - end - end - - # A Tag is any node that represents markup. It may be an opening tag, a - # closing tag, or a self-closing tag. It has a name, and may have a hash of - # attributes. - class Tag < Node #:nodoc: - - # Either +nil+, :close, or :self - attr_reader :closing - - # Either +nil+, or a hash of attributes for this node. - attr_reader :attributes - - # The name of this tag. - attr_reader :name - - # Create a new node as a child of the given parent, using the given content - # to describe the node. It will be parsed and the node name, attributes and - # closing status extracted. - def initialize(parent, line, pos, name, attributes, closing) - super(parent, line, pos) - @name = name - @attributes = attributes - @closing = closing - end - - # A convenience for obtaining an attribute of the node. Returns +nil+ if - # the node has no attributes. - def [](attr) - @attributes ? @attributes[attr] : nil - end - - # Returns non-+nil+ if this tag can contain child nodes. - def childless?(xml = false) - return false if xml && @closing.nil? - !@closing.nil? || - @name =~ /^(img|br|hr|link|meta|area|base|basefont| - col|frame|input|isindex|param)$/ox - end - - # Returns a textual representation of the node - def to_s - if @closing == :close - "" - else - s = "<#{@name}" - @attributes.each do |k,v| - s << " #{k}" - s << "=\"#{v}\"" if String === v - end - s << " /" if @closing == :self - s << ">" - @children.each { |child| s << child.to_s } - s << "" if @closing != :self && !@children.empty? - s - end - end - - # If either the node or any of its children meet the given conditions, the - # matching node is returned. Otherwise, +nil+ is returned. (See the - # description of the valid conditions in the +match+ method.) - def find(conditions) - match(conditions) && self || super - end - - # Returns +true+, indicating that this node represents an HTML tag. - def tag? - true - end - - # Returns +true+ if the node meets any of the given conditions. The - # +conditions+ parameter must be a hash of any of the following keys - # (all are optional): - # - # * :tag: the node name must match the corresponding value - # * :attributes: a hash. The node's values must match the - # corresponding values in the hash. - # * :parent: a hash. The node's parent must match the - # corresponding hash. - # * :child: a hash. At least one of the node's immediate children - # must meet the criteria described by the hash. - # * :ancestor: a hash. At least one of the node's ancestors must - # meet the criteria described by the hash. - # * :descendant: a hash. At least one of the node's descendants - # must meet the criteria described by the hash. - # * :sibling: a hash. At least one of the node's siblings must - # meet the criteria described by the hash. - # * :after: a hash. The node must be after any sibling meeting - # the criteria described by the hash, and at least one sibling must match. - # * :before: a hash. The node must be before any sibling meeting - # the criteria described by the hash, and at least one sibling must match. - # * :children: a hash, for counting children of a node. Accepts the - # keys: - # ** :count: either a number or a range which must equal (or - # include) the number of children that match. - # ** :less_than: the number of matching children must be less than - # this number. - # ** :greater_than: the number of matching children must be - # greater than this number. - # ** :only: another hash consisting of the keys to use - # to match on the children, and only matching children will be - # counted. - # - # Conditions are matched using the following algorithm: - # - # * if the condition is a string, it must be a substring of the value. - # * if the condition is a regexp, it must match the value. - # * if the condition is a number, the value must match number.to_s. - # * if the condition is +true+, the value must not be +nil+. - # * if the condition is +false+ or +nil+, the value must be +nil+. - # - # Usage: - # - # # test if the node is a "span" tag - # node.match :tag => "span" - # - # # test if the node's parent is a "div" - # node.match :parent => { :tag => "div" } - # - # # test if any of the node's ancestors are "table" tags - # node.match :ancestor => { :tag => "table" } - # - # # test if any of the node's immediate children are "em" tags - # node.match :child => { :tag => "em" } - # - # # test if any of the node's descendants are "strong" tags - # node.match :descendant => { :tag => "strong" } - # - # # test if the node has between 2 and 4 span tags as immediate children - # node.match :children => { :count => 2..4, :only => { :tag => "span" } } - # - # # get funky: test to see if the node is a "div", has a "ul" ancestor - # # and an "li" parent (with "class" = "enum"), and whether or not it has - # # a "span" descendant that contains # text matching /hello world/: - # node.match :tag => "div", - # :ancestor => { :tag => "ul" }, - # :parent => { :tag => "li", - # :attributes => { :class => "enum" } }, - # :descendant => { :tag => "span", - # :child => /hello world/ } - def match(conditions) - conditions = validate_conditions(conditions) - # check content of child nodes - if conditions[:content] - if children.empty? - return false unless match_condition("", conditions[:content]) - else - return false unless children.find { |child| child.match(conditions[:content]) } - end - end - - # test the name - return false unless match_condition(@name, conditions[:tag]) if conditions[:tag] - - # test attributes - (conditions[:attributes] || {}).each do |key, value| - return false unless match_condition(self[key], value) - end - - # test parent - return false unless parent.match(conditions[:parent]) if conditions[:parent] - - # test children - return false unless children.find { |child| child.match(conditions[:child]) } if conditions[:child] - - # test ancestors - if conditions[:ancestor] - return false unless catch :found do - p = self - throw :found, true if p.match(conditions[:ancestor]) while p = p.parent - end - end - - # test descendants - if conditions[:descendant] - return false unless children.find do |child| - # test the child - child.match(conditions[:descendant]) || - # test the child's descendants - child.match(:descendant => conditions[:descendant]) - end - end - - # count children - if opts = conditions[:children] - matches = children.select do |c| - (c.kind_of?(HTML::Tag) and (c.closing == :self or ! c.childless?)) - end - - matches = matches.select { |c| c.match(opts[:only]) } if opts[:only] - opts.each do |key, value| - next if key == :only - case key - when :count - if Integer === value - return false if matches.length != value - else - return false unless value.include?(matches.length) - end - when :less_than - return false unless matches.length < value - when :greater_than - return false unless matches.length > value - else raise "unknown count condition #{key}" - end - end - end - - # test siblings - if conditions[:sibling] || conditions[:before] || conditions[:after] - siblings = parent ? parent.children : [] - self_index = siblings.index(self) - - if conditions[:sibling] - return false unless siblings.detect do |s| - s != self && s.match(conditions[:sibling]) - end - end - - if conditions[:before] - return false unless siblings[self_index+1..-1].detect do |s| - s != self && s.match(conditions[:before]) - end - end - - if conditions[:after] - return false unless siblings[0,self_index].detect do |s| - s != self && s.match(conditions[:after]) - end - end - end - - true - end - - def ==(node) - return false unless super - return false unless closing == node.closing && self.name == node.name - attributes == node.attributes - end - - private - # Match the given value to the given condition. - def match_condition(value, condition) - case condition - when String - value && value == condition - when Regexp - value && value.match(condition) - when Numeric - value == condition.to_s - when true - !value.nil? - when false, nil - value.nil? - else - false - end - end - end -end diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb deleted file mode 100644 index 6b4ececda2..0000000000 --- a/actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb +++ /dev/null @@ -1,188 +0,0 @@ -require 'set' -require 'cgi' -require 'active_support/core_ext/class/attribute_accessors' - -module HTML - class Sanitizer - def sanitize(text, options = {}) - validate_options(options) - return text unless sanitizeable?(text) - tokenize(text, options).join - end - - def sanitizeable?(text) - !(text.nil? || text.empty? || !text.index("<")) - end - - protected - def tokenize(text, options) - tokenizer = HTML::Tokenizer.new(text) - result = [] - while token = tokenizer.next - node = Node.parse(nil, 0, 0, token, false) - process_node node, result, options - end - result - end - - def process_node(node, result, options) - result << node.to_s - end - - def validate_options(options) - if options[:tags] && !options[:tags].is_a?(Enumerable) - raise ArgumentError, "You should pass :tags as an Enumerable" - end - - if options[:attributes] && !options[:attributes].is_a?(Enumerable) - raise ArgumentError, "You should pass :attributes as an Enumerable" - end - end - end - - class FullSanitizer < Sanitizer - def sanitize(text, options = {}) - result = super - # strip any comments, and if they have a newline at the end (ie. line with - # only a comment) strip that too - result = result.gsub(/[\n]?/m, "") if (result && result =~ /[\n]?/m) - # Recurse - handle all dirty nested tags - result == text ? result : sanitize(result, options) - end - - def process_node(node, result, options) - result << node.to_s if node.class == HTML::Text - end - end - - class LinkSanitizer < FullSanitizer - cattr_accessor :included_tags, :instance_writer => false - self.included_tags = Set.new(%w(a href)) - - def sanitizeable?(text) - !(text.nil? || text.empty? || !((text.index(""))) - end - - protected - def process_node(node, result, options) - result << node.to_s unless node.is_a?(HTML::Tag) && included_tags.include?(node.name) - end - end - - class WhiteListSanitizer < Sanitizer - [:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags, - :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr| - class_attribute attr, :instance_writer => false - end - - # A regular expression of the valid characters used to separate protocols like - # the ':' in 'http://foo.com' - self.protocol_separator = /:|(�*58)|(p)|(%|%)3A/ - - # Specifies a Set of HTML attributes that can have URIs. - self.uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc)) - - # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed - # to just escaping harmless tags like <font> - self.bad_tags = Set.new(%w(script)) - - # Specifies the default Set of tags that the #sanitize helper will allow unscathed. - self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub - sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr - acronym a img blockquote del ins)) - - # Specifies the default Set of html attributes that the #sanitize helper will leave - # in the allowed tag. - self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr)) - - # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept. - self.allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto - feed svn urn aim rsync tag ssh sftp rtsp afs)) - - # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept. - self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse - border-color border-left-color border-right-color border-top-color clear color cursor direction display - elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height - overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation - speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space - width)) - - # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept. - self.allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center - collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal - nowrap olive pointer purple red right solid silver teal top transparent underline white yellow)) - - # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers. - self.shorthand_css_properties = Set.new(%w(background border margin padding)) - - # Sanitizes a block of css code. Used by #sanitize when it comes across a style attribute - def sanitize_css(style) - # disallow urls - style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ') - - # gauntlet - if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/ || - style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$)\s*)*$/ - return '' - end - - clean = [] - style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val| - if allowed_css_properties.include?(prop.downcase) - clean << prop + ': ' + val + ';' - elsif shorthand_css_properties.include?(prop.split('-')[0].downcase) - unless val.split().any? do |keyword| - !allowed_css_keywords.include?(keyword) && - keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/ - end - clean << prop + ': ' + val + ';' - end - end - end - clean.join(' ') - end - - protected - def tokenize(text, options) - options[:parent] = [] - options[:attributes] ||= allowed_attributes - options[:tags] ||= allowed_tags - super - end - - def process_node(node, result, options) - result << case node - when HTML::Tag - if node.closing == :close - options[:parent].shift - else - options[:parent].unshift node.name - end - - process_attributes_for node, options - - options[:tags].include?(node.name) ? node : nil - else - bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(//login. - # - # === Matching Elements - # - # Use the #match method to determine if an element matches the selector. - # - # For simple selectors, the method returns an array with that element, - # or +nil+ if the element does not match. For complex selectors (see below) - # the method returns an array with all matched elements, of +nil+ if no - # match found. - # - # For example: - # if selector.match(element) - # puts "Element is a login form" - # end - # - # === Selecting Elements - # - # Use the #select method to select all matching elements starting with - # one element and going through all children in depth-first order. - # - # This method returns an array of all matching elements, an empty array - # if no match is found - # - # For example: - # selector = HTML::Selector.new "input[type=text]" - # matches = selector.select(element) - # matches.each do |match| - # puts "Found text field with name #{match.attributes['name']}" - # end - # - # === Expressions - # - # Selectors can match elements using any of the following criteria: - # * name -- Match an element based on its name (tag name). - # For example, p to match a paragraph. You can use * - # to match any element. - # * #id -- Match an element based on its identifier (the - # id attribute). For example, #page. - # * .class -- Match an element based on its class name, all - # class names if more than one specified. - # * [attr] -- Match an element that has the specified attribute. - # * [attr=value] -- Match an element that has the specified - # attribute and value. (More operators are supported see below) - # * :pseudo-class -- Match an element based on a pseudo class, - # such as :nth-child and :empty. - # * :not(expr) -- Match an element that does not match the - # negation expression. - # - # When using a combination of the above, the element name comes first - # followed by identifier, class names, attributes, pseudo classes and - # negation in any order. Do not separate these parts with spaces! - # Space separation is used for descendant selectors. - # - # For example: - # selector = HTML::Selector.new "form.login[action=/login]" - # The matched element must be of type +form+ and have the class +login+. - # It may have other classes, but the class +login+ is required to match. - # It must also have an attribute called +action+ with the value - # /login. - # - # This selector will match the following element: - #