require 'strscan'
module HTML
  # A simple HTML tokenizer. It breaks a stream of text into tokens, where each
  # token is a string. Each string represents either "text" or an HTML element
  # (a regular tag, a comment, or a doctype-style <tt><!...></tt> declaration).
  #
  # The tokenizer is written for valid XHTML, which has no free < or >
  # characters. Malformed input is still tokenized without raising or looping:
  # a stray < that does not start a tag is returned as part of a text token, and
  # an unterminated comment runs to the end of the input.
  #
  # Usage:
  #
  #   tokenizer = HTML::Tokenizer.new(text)
  #   while token = tokenizer.next
  #     p token
  #   end
  class Tokenizer
    # The (byte) position in the text at which the token most recently
    # returned by #next starts. It is 0 before #next has been called.
    attr_reader :position

    # The line number on which the token most recently returned by #next
    # starts (the first line is 1). It is 0 before #next has been called.
    attr_reader :line

    # Create a new Tokenizer for the given text.
    def initialize(text)
      @scanner = StringScanner.new(text)
      @position = 0
      @line = 0
      @current_line = 1
    end

    # Return the next token in the sequence, or +nil+ if there are no more
    # tokens in the stream. Every non-nil token is a non-empty string, so a
    # <tt>while token = tokenizer.next</tt> loop always terminates.
    def next
      return nil if @scanner.eos?

      @position = @scanner.pos
      @line = @current_line
      # Only a < immediately followed by a non-space character opens a tag;
      # anything else (including a lone <) is treated as text.
      if @scanner.check(/<\S/)
        update_current_line(scan_tag)
      else
        update_current_line(scan_text)
      end
    end

    private

    # Treat the text at the current position as a tag, and scan it. Supports
    # comments, doctype tags, and regular tags, and ignores less-than and
    # greater-than characters within quoted strings.
    def scan_tag
      tag = @scanner.getch
      if @scanner.scan(/!--/) # comment
        tag << @scanner.matched
        # scan_until returns nil when the comment is never closed; in that
        # case the comment swallows the remainder of the input.
        tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan(/.*/m))
      elsif @scanner.scan(/!/) # doctype
        tag << @scanner.matched
        tag << consume_quoted_regions
      else
        tag << consume_quoted_regions
      end
      tag
    end

    # Scan one character plus all text up to the next < character and return
    # it. Unconditionally consuming the first character guarantees progress
    # when the scanner is sitting on a < that does not open a tag; otherwise
    # the pattern below would match the empty string forever.
    def scan_text
      "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
    end

    # Counts the number of newlines in the text and updates the current line
    # accordingly. Returns the text unchanged so it can be used as the token.
    def update_current_line(text)
      @current_line += text.scan(/\r\n|\r|\n/).length
      text
    end

    # Consumes input up to and including the > that closes the tag, skipping
    # over quoted strings so that less-than and greater-than characters within
    # the strings are ignored. Returns the consumed text; if no closing > (or
    # quote) is found, returns whatever was consumed so far.
    def consume_quoted_regions
      text = ""
      loop do
        match = @scanner.scan_until(/['">]/) or break
        text << match
        break if (delim = @scanner.matched) == ">"

        # consume the quoted region
        while match = @scanner.scan_until(/[\\#{delim}]/)
          text << match
          break if @scanner.matched == delim
          # A backslash as the very last character has nothing to escape.
          break if @scanner.eos?

          text << @scanner.getch # skip the escaped character
        end
      end
      text
    end
  end
end