aboutsummaryrefslogtreecommitdiffstats
path: root/actionpack/lib/action_controller/vendor/html-scanner/html
diff options
context:
space:
mode:
authorDavid Heinemeier Hansson <david@loudthinking.com>2005-04-17 16:43:48 +0000
committerDavid Heinemeier Hansson <david@loudthinking.com>2005-04-17 16:43:48 +0000
commit806cf6d76ab557ceebf5ac5d22f3e39076e1ab61 (patch)
tree5ca7149e3f8ee758cb3b0fd2df096baf61824153 /actionpack/lib/action_controller/vendor/html-scanner/html
parent95e6c03de7cd52a426560a6c46bd08b1791d7022 (diff)
downloadrails-806cf6d76ab557ceebf5ac5d22f3e39076e1ab61.tar.gz
rails-806cf6d76ab557ceebf5ac5d22f3e39076e1ab61.tar.bz2
rails-806cf6d76ab557ceebf5ac5d22f3e39076e1ab61.zip
Added assert_tag and assert_no_tag as a much improved alternative to the deprecated assert_template_xpath_match #1126 [Jamis Buck]
git-svn-id: http://svn-commit.rubyonrails.org/rails/trunk@1195 5ecf4fe2-1ee6-0310-87b1-e25e094e27de
Diffstat (limited to 'actionpack/lib/action_controller/vendor/html-scanner/html')
-rw-r--r--actionpack/lib/action_controller/vendor/html-scanner/html/document.rb63
-rw-r--r--actionpack/lib/action_controller/vendor/html-scanner/html/node.rb431
-rw-r--r--actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb95
-rw-r--r--actionpack/lib/action_controller/vendor/html-scanner/html/version.rb11
4 files changed, 600 insertions, 0 deletions
diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb
new file mode 100644
index 0000000000..a196bdea44
--- /dev/null
+++ b/actionpack/lib/action_controller/vendor/html-scanner/html/document.rb
@@ -0,0 +1,63 @@
+require 'html/tokenizer'
+require 'html/node'
+
+module HTML
+
+ # A top-level HTMl document. You give it a body of text, and it will parse that
+ # text into a tree of nodes.
+ class Document
+
+ # The root of the parsed document.
+ attr_reader :root
+
+ # Create a new Document from the given text.
+ def initialize(text)
+ tokenizer = Tokenizer.new(text)
+ @root = Node.new(nil)
+ node_stack = [ @root ]
+ while token = tokenizer.next
+ node = Node.parse(node_stack.last, tokenizer.line, tokenizer.position, token)
+
+ node_stack.last.children << node unless node.tag? && node.closing == :close
+ if node.tag? && !node.childless?
+ if node_stack.length > 1 && node.closing == :close
+ if node_stack.last.name == node.name
+ node_stack.pop
+ else
+ open_start = node_stack.last.position - 20
+ open_start = 0 if open_start < 0
+ close_start = node.position - 20
+ close_start = 0 if close_start < 0
+ warn <<EOF.strip
+ignoring attempt to close #{node_stack.last.name} with #{node.name}
+ opened at byte #{node_stack.last.position}, line #{node_stack.last.line}
+ closed at byte #{node.position}, line #{node.line}
+ attributes at open: #{node_stack.last.attributes.inspect}
+ text around open: #{text[open_start,40].inspect}
+ text around close: #{text[close_start,40].inspect}
+EOF
+ end
+ elsif node.closing != :close
+ node_stack.push node
+ end
+ end
+ end
+ end
+
+ # Search the tree for (and return) the first node that matches the given
+ # conditions. The conditions are interpreted differently for different node
+ # types, see HTML::Text#find and HTML::Tag#find.
+ def find(conditions)
+ @root.find(conditions)
+ end
+
+ # Search the tree for (and return) all nodes that match the given
+ # conditions. The conditions are interpreted differently for different node
+ # types, see HTML::Text#find and HTML::Tag#find.
+ def find_all(conditions)
+ @root.find_all(conditions)
+ end
+
+ end
+
+end
diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/node.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/node.rb
new file mode 100644
index 0000000000..018309a9f9
--- /dev/null
+++ b/actionpack/lib/action_controller/vendor/html-scanner/html/node.rb
@@ -0,0 +1,431 @@
+require 'strscan'
+
+module HTML
+
+ class Conditions < Hash
+ def initialize(hash)
+ super()
+ hash = { :content => hash } unless Hash === hash
+ hash = keys_to_symbols(hash)
+ hash.each do |k,v|
+ case k
+ when :tag, :content then
+ # keys are valid, and require no further processing
+ when :attributes then
+ hash[k] = keys_to_strings(v)
+ when :parent, :child, :ancestor, :descendant
+ hash[k] = Conditions.new(v)
+ when :children
+ hash[k] = v = keys_to_symbols(v)
+ v.each do |k,v2|
+ case k
+ when :count, :greater_than, :less_than
+ # keys are valid, and require no further processing
+ when :only
+ v[k] = Conditions.new(v2)
+ else
+ raise "illegal key #{k.inspect} => #{v2.inspect}"
+ end
+ end
+ else
+ raise "illegal key #{k.inspect} => #{v.inspect}"
+ end
+ end
+ update hash
+ end
+
+ private
+
+ def keys_to_strings(hash)
+ hash.keys.inject({}) do |h,k|
+ h[k.to_s] = hash[k]
+ h
+ end
+ end
+
+ def keys_to_symbols(hash)
+ hash.keys.inject({}) do |h,k|
+ raise "illegal key #{k.inspect}" unless k.respond_to?(:to_sym)
+ h[k.to_sym] = hash[k]
+ h
+ end
+ end
+ end
+
+ # The base class of all nodes, textual and otherwise, in an HTML document.
+ class Node
+ # The array of children of this node. Not all nodes have children.
+ attr_reader :children
+
+ # The parent node of this node. All nodes have a parent, except for the
+ # root node.
+ attr_reader :parent
+
+ # The line number of the input where this node was begun
+ attr_reader :line
+
+ # The byte position in the input where this node was begun
+ attr_reader :position
+
+ # Create a new node as a child of the given parent.
+ def initialize(parent, line=0, pos=0)
+ @parent = parent
+ @children = []
+ @line, @position = line, pos
+ end
+
+ # Return a textual representation of the node.
+ def to_s
+ s = ""
+ @children.each { |child| s << child.to_s }
+ s
+ end
+
+ # Return false (subclasses must override this to provide specific matching
+ # behavior.) +conditions+ may be of any type.
+ def match(conditions)
+ false
+ end
+
+ # Search the children of this node for the first node for which #find
+ # returns non +nil+. Returns the result of the #find call that succeeded.
+ def find(conditions)
+ @children.each do |child|
+ node = child.find(conditions)
+ return node if node
+ end
+ nil
+ end
+
+ # Search for all nodes that match the given conditions, and return them
+ # as an array.
+ def find_all(conditions)
+ matches = []
+ matches << self if match(conditions)
+ @children.each do |child|
+ matches.concat child.find_all(conditions)
+ end
+ matches
+ end
+
+ # Returns +false+. Subclasses may override this if they define a kind of
+ # tag.
+ def tag?
+ false
+ end
+
+ def validate_conditions(conditions)
+ Conditions === conditions ? conditions : Conditions.new(conditions)
+ end
+
+ class <<self
+ def parse(parent, line, pos, content)
+ if content !~ /^<\S/
+ Text.new(parent, line, pos, content)
+ else
+ scanner = StringScanner.new(content)
+ scanner.skip(/</) or raise "expected <"
+ closing = ( scanner.scan(/\//) ? :close : nil )
+ return Text.new(parent, line, pos, content) unless name = scanner.scan(/[\w:]+/)
+ name.downcase!
+
+ unless closing
+ scanner.skip(/\s*/)
+ attributes = {}
+ while attr = scanner.scan(/[-\w:]+/)
+ value = true
+ if scanner.scan(/\s*=\s*/)
+ if delim = scanner.scan(/['"]/)
+ value = ""
+ while text = scanner.scan(/[^#{delim}\\]+|./)
+ case text
+ when "\\" then
+ value << text
+ value << scanner.getch
+ when delim
+ break
+ else value << text
+ end
+ end
+ else
+ value = scanner.scan(/[^\s>\/]+/)
+ end
+ end
+ attributes[attr.downcase] = value
+ scanner.skip(/\s*/)
+ end
+
+ closing = ( scanner.scan(/\//) ? :self : nil )
+ end
+
+ scanner.scan(/\s*>/) or raise "expected > (got #{scanner.rest.inspect} for #{content}, #{attributes.inspect})"
+
+ Tag.new(parent, line, pos, name, attributes, closing)
+ end
+ end
+ end
+ end
+
+ # A node that represents text, rather than markup.
+ class Text < Node
+
+ attr_reader :content
+
+ # Creates a new text node as a child of the given parent, with the given
+ # content.
+ def initialize(parent, line, pos, content)
+ super(parent, line, pos)
+ @content = content
+ end
+
+ # Returns the content of this node.
+ def to_s
+ @content
+ end
+
+ # Returns +self+ if this node meets the given conditions. Text nodes support
+ # conditions of the following kinds:
+ #
+ # * if +conditions+ is a string, it must be a substring of the node's
+ # content
+ # * if +conditions+ is a regular expression, it must match the node's
+ # content
+ # * if +conditions+ is a hash, it must contain a <tt>:content</tt> key that
+ # is either a string or a regexp, and which is interpreted as described
+ # above.
+ def find(conditions)
+ match(conditions) && self
+ end
+
+ # Returns non-+nil+ if this node meets the given conditions, or +nil+
+ # otherwise. See the discussion of #find for the valid conditions.
+ def match(conditions)
+ case conditions
+ when String
+ @content.index(conditions)
+ when Regexp
+ @content =~ conditions
+ when Hash
+ conditions = validate_conditions(conditions)
+
+ # Text nodes only have :content, :parent, :ancestor
+ unless (conditions.keys - [:content, :parent, :ancestor]).empty?
+ return false
+ end
+
+ match(conditions[:content])
+ else
+ nil
+ end
+ end
+ end
+
+ # A Tag is any node that represents markup. It may be an opening tag, a
+ # closing tag, or a self-closing tag. It has a name, and may have a hash of
+ # attributes.
+ class Tag < Node
+
+ # Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
+ attr_reader :closing
+
+ # Either +nil+, or a hash of attributes for this node.
+ attr_reader :attributes
+
+ # The name of this tag.
+ attr_reader :name
+
+ # Create a new node as a child of the given parent, using the given content
+ # to describe the node. It will be parsed and the node name, attributes and
+ # closing status extracted.
+ def initialize(parent, line, pos, name, attributes, closing)
+ super(parent, line, pos)
+ @name = name
+ @attributes = attributes
+ @closing = closing
+ end
+
+ # A convenience for obtaining an attribute of the node. Returns +nil+ if
+ # the node has no attributes.
+ def [](attr)
+ @attributes ? @attributes[attr] : nil
+ end
+
+ # Returns non-+nil+ if this tag can contain child nodes.
+ def childless?
+ @name =~ /^(img|br|hr|link|meta|area|base|basefont|col|frame|input|isindex|param)$/o
+ end
+
+ # Returns a textual representation of the node
+ def to_s
+ if @closing == :close
+ "</#{@name}>"
+ else
+ s = "<#{@name}"
+ @attributes.each { |k,v| s << " #{k}='#{v.gsub(/'/,"\\\\'")}'" }
+ s << " /" if @closing == :self
+ s << ">"
+ @children.each { |child| s << child.to_s }
+ s
+ end
+ end
+
+ # If either the node or any of its children meet the given conditions, the
+ # matching node is returned. Otherwise, +nil+ is returned. (See the
+ # description of the valid conditions in the +match+ method.)
+ def find(conditions)
+ match(conditions) && self || super
+ end
+
+ # Returns +true+, indicating that this node represents an HTML tag.
+ def tag?
+ true
+ end
+
+ # Returns +true+ if the node meets any of the given conditions. The
+ # +conditions+ parameter must be a hash of any of the following keys
+ # (all are optional):
+ #
+ # * <tt>:tag</tt>: the node name must match the corresponding value
+ # * <tt>:attributes</tt>: a hash. The node's values must match the
+ # corresponding values in the hash.
+ # * <tt>:parent</tt>: a hash. The node's parent must match the
+ # corresponding hash.
+ # * <tt>:child</tt>: a hash. At least one of the node's immediate children
+ # must meet the criteria described by the hash.
+ # * <tt>:ancestor</tt>: a hash. At least one of the node's ancestors must
+ # meet the criteria described by the hash.
+ # * <tt>:descendant</tt>: a hash. At least one of the node's descendants
+ # must meet the criteria described by the hash.
+ # * <tt>:children</tt>: a hash, for counting children of a node. Accepts the
+ # keys:
+ # ** <tt>:count</tt>: either a number or a range which must equal (or
+ # include) the number of children that match.
+ # ** <tt>:less_than</tt>: the number of matching children must be less than
+ # this number.
+ # ** <tt>:greater_than</tt>: the number of matching children must be
+ # greater than this number.
+ # ** <tt>:only</tt>: another hash consisting of the keys to use
+ # to match on the children, and only matching children will be
+ # counted.
+ #
+ # Conditions are matched using the following algorithm:
+ #
+ # * if the condition is a string, it must be a substring of the value.
+ # * if the condition is a regexp, it must match the value.
+ # * if the condition is a number, the value must match number.to_s.
+ # * if the condition is +true+, the value must not be +nil+.
+ # * if the condition is +false+ or +nil+, the value must be +nil+.
+ #
+ # Usage:
+ #
+ # # test if the node is a "span" tag
+ # node.match :tag => "span"
+ #
+ # # test if the node's parent is a "div"
+ # node.match :parent => { :tag => "div" }
+ #
+ # # test if any of the node's ancestors are "table" tags
+ # node.match :ancestor => { :tag => "table" }
+ #
+ # # test if any of the node's immediate children are "em" tags
+ # node.match :child => { :tag => "em" }
+ #
+ # # test if any of the node's descendants are "strong" tags
+ # node.match :descendant => { :tag => "strong" }
+ #
+ # # test if the node has between 2 and 4 span tags as immediate children
+ # node.match :children => { :count => 2..4, :only => { :tag => "span" } }
+ #
+ # # get funky: test to see if the node is a "div", has a "ul" ancestor
+ # # and an "li" parent (with "class" = "enum"), and whether or not it has
+ # # a "span" descendant that contains # text matching /hello world/:
+ # node.match :tag => "div",
+ # :ancestor => { :tag => "ul" },
+ # :parent => { :tag => "li",
+ # :attributes => { :class => "enum" } },
+ # :descendant => { :tag => "span",
+ # :child => /hello world/ }
+ def match(conditions)
+ conditions = validate_conditions(conditions)
+
+ # only Text nodes have content
+ return false if conditions[:content]
+
+ # test the name
+ return false unless match_condition(@name, conditions[:tag]) if conditions[:tag]
+
+ # test attributes
+ (conditions[:attributes] || {}).each do |key, value|
+ return false unless match_condition(self[key], value)
+ end
+
+ # test parent
+ return false unless parent.match(conditions[:parent]) if conditions[:parent]
+
+ # test children
+ return false unless children.find { |child| child.match(conditions[:child]) } if conditions[:child]
+
+ # test ancestors
+ if conditions[:ancestor]
+ return false unless catch :found do
+ p = self
+ throw :found, true if p.match(conditions[:ancestor]) while p = p.parent
+ end
+ end
+
+ # test descendants
+ if conditions[:descendant]
+ return false unless children.find do |child|
+ # test the child
+ child.match(conditions[:descendant]) ||
+ # test the child's descendants
+ child.match(:descendant => conditions[:descendant])
+ end
+ end
+
+ # count children
+ if opts = conditions[:children]
+ matches = children
+ matches = matches.select { |c| c.match(opts[:only]) } if opts[:only]
+ opts.each do |key, value|
+ next if key == :only
+ case key
+ when :count
+ if Integer === value
+ return false if matches.length != value
+ else
+ return false unless value.include?(matches.length)
+ end
+ when :less_than
+ return false unless matches.length < value
+ when :greater_than
+ return false unless matches.length > value
+ else raise "unknown count condition #{key}"
+ end
+ end
+ end
+
+ true
+ end
+
+ private
+
+ # Match the given value to the given condition.
+ def match_condition(value, condition)
+ case condition
+ when String
+ value && value.index(condition)
+ when Regexp
+ value && value.match(condition)
+ when Numeric
+ value == condition.to_s
+ when true
+ !value.nil?
+ when false, nil
+ value.nil?
+ else
+ false
+ end
+ end
+ end
+end
diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb
new file mode 100644
index 0000000000..11bd48708e
--- /dev/null
+++ b/actionpack/lib/action_controller/vendor/html-scanner/html/tokenizer.rb
@@ -0,0 +1,95 @@
+require 'strscan'
+
+module HTML
+
+ # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
+ # token is a string. Each string represents either "text", or an HTML element.
+ #
+ # This currently assumes valid XHTML, which means no free < or > characters.
+ #
+ # Usage:
+ #
+ # tokenizer = HTML::Tokenizer.new(text)
+ # while token = tokenizer.next
+ # p token
+ # end
+ class Tokenizer
+
+ # The current (byte) position in the text
+ attr_reader :position
+
+ # The current line number
+ attr_reader :line
+
+ # Create a new Tokenizer for the given text.
+ def initialize(text)
+ @scanner = StringScanner.new(text)
+ @position = 0
+ @line = 0
+ @current_line = 1
+ end
+
+ # Return the next token in the sequence, or +nil+ if there are no more tokens in
+ # the stream.
+ def next
+ return nil if @scanner.eos?
+ @position = @scanner.pos
+ @line = @current_line
+ if @scanner.check(/<\S/)
+ update_current_line(scan_tag)
+ else
+ update_current_line(scan_text)
+ end
+ end
+
+ private
+
+ # Treat the text at the current position as a tag, and scan it. Supports
+ # comments, doctype tags, and regular tags, and ignores less-than and
+ # greater-than characters within quoted strings.
+ def scan_tag
+ tag = @scanner.getch
+ if @scanner.scan(/!--/) # comment
+ tag << @scanner.matched
+ tag << @scanner.scan_until(/--\s*>/)
+ elsif @scanner.scan(/!/) # doctype
+ tag << @scanner.matched
+ tag << consume_quoted_regions
+ else
+ tag << consume_quoted_regions
+ end
+ tag
+ end
+
+ # Scan all text up to the next < character and return it.
+ def scan_text
+ @scanner.scan(/[^<]*/)
+ end
+
+ # Counts the number of newlines in the text and updates the current line
+ # accordingly.
+ def update_current_line(text)
+ @current_line += text.scan(/\r\n|\r|\n/).length
+ text
+ end
+
+ # Skips over quoted strings, so that less-than and greater-than characters
+ # within the strings are ignored.
+ def consume_quoted_regions
+ text = ""
+ loop do
+ match = @scanner.scan_until(/['">]/) or break
+ text << match
+ break if (delim = @scanner.matched) == ">"
+ # consume the conqued region
+ while match = @scanner.scan_until(/[\\#{delim}]/)
+ text << match
+ break if @scanner.matched == delim
+ text << @scanner.getch # skip the escaped character
+ end
+ end
+ text
+ end
+ end
+
+end \ No newline at end of file
diff --git a/actionpack/lib/action_controller/vendor/html-scanner/html/version.rb b/actionpack/lib/action_controller/vendor/html-scanner/html/version.rb
new file mode 100644
index 0000000000..90025fa57f
--- /dev/null
+++ b/actionpack/lib/action_controller/vendor/html-scanner/html/version.rb
@@ -0,0 +1,11 @@
+module HTML
+ module Version
+
+ MAJOR = 0
+ MINOR = 5
+ TINY = 0
+
+ STRING = [ MAJOR, MINOR, TINY ].join(".")
+
+ end
+end