From 7a40f4354b32809af3d0cfd6e3af0eda02ab0e0a Mon Sep 17 00:00:00 2001 From: friendica Date: Sat, 12 May 2012 17:57:41 -0700 Subject: some important stuff we'll need --- lib/htmlpurifier/tests/HTMLPurifier/LexerTest.php | 752 ++++++++++++++++++++++ 1 file changed, 752 insertions(+) create mode 100644 lib/htmlpurifier/tests/HTMLPurifier/LexerTest.php (limited to 'lib/htmlpurifier/tests/HTMLPurifier/LexerTest.php') diff --git a/lib/htmlpurifier/tests/HTMLPurifier/LexerTest.php b/lib/htmlpurifier/tests/HTMLPurifier/LexerTest.php new file mode 100644 index 000000000..42a59aeb6 --- /dev/null +++ b/lib/htmlpurifier/tests/HTMLPurifier/LexerTest.php @@ -0,0 +1,752 @@ +config->set('Core.MaintainLineNumbers', true); + $lexer = HTMLPurifier_Lexer::create($this->config); + $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex'); + } + + function test_create_objectLexerImpl() { + $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex()); + $lexer = HTMLPurifier_Lexer::create($this->config); + $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex'); + } + + function test_create_unknownLexer() { + $this->config->set('Core.LexerImpl', 'AsdfAsdf'); + $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf')); + HTMLPurifier_Lexer::create($this->config); + } + + function test_create_incompatibleLexer() { + $this->config->set('Core.LexerImpl', 'DOMLex'); + $this->config->set('Core.MaintainLineNumbers', true); + $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)')); + HTMLPurifier_Lexer::create($this->config); + } + + // HTMLPurifier_Lexer->parseData() ----------------------------------------- + + function assertParseData($input, $expect = true) { + if ($expect === true) $expect = $input; + $lexer = new HTMLPurifier_Lexer(); + $this->assertIdentical($expect, $lexer->parseData($input)); + } + + function test_parseData_plainText() { + $this->assertParseData('asdf'); + } + + function test_parseData_ampersandEntity() { + $this->assertParseData('&', '&'); + } + + function test_parseData_quotEntity() { + $this->assertParseData('"', '"'); + } + + function test_parseData_aposNumericEntity() { + $this->assertParseData(''', "'"); + } + + function test_parseData_aposCompactNumericEntity() { + $this->assertParseData(''', "'"); + } + + function test_parseData_adjacentAmpersandEntities() { + $this->assertParseData('&&&', '&&&'); + } + + function test_parseData_trailingUnescapedAmpersand() { + $this->assertParseData('&&', '&&'); + } + + function test_parseData_internalUnescapedAmpersand() { + $this->assertParseData('Procter & Gamble'); + } + + function test_parseData_improperEntityFaultToleranceTest() { + $this->assertParseData('-'); + } + + // HTMLPurifier_Lexer->extractBody() --------------------------------------- + + function assertExtractBody($text, $extract = true) { + $lexer = new HTMLPurifier_Lexer(); + $result = $lexer->extractBody($text); + if ($extract === true) $extract = $text; + $this->assertIdentical($extract, $result); + } + + function test_extractBody_noBodyTags() { + $this->assertExtractBody('Bold'); + } + + function test_extractBody_lowercaseBodyTags() { + $this->assertExtractBody('Bold', 'Bold'); + } + + function test_extractBody_uppercaseBodyTags() { + $this->assertExtractBody('Bold', 'Bold'); + } + + function test_extractBody_realisticUseCase() { + $this->assertExtractBody( +' + + + xyz + + +
+
+ + + +
+
+ +', + ' +
+
+ + + +
+
+ '); + } + + function test_extractBody_bodyWithAttributes() { + $this->assertExtractBody('Bold', 'Bold'); + } + + function test_extractBody_preserveUnclosedBody() { + $this->assertExtractBody('asdf'); // not closed, don't accept + } + + function test_extractBody_useLastBody() { + $this->assertExtractBody('foobar', 'foobar'); + } + + // HTMLPurifier_Lexer->tokenizeHTML() -------------------------------------- + + function assertTokenization($input, $expect, $alt_expect = array()) { + $lexers = array(); + $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex(); + if (class_exists('DOMDocument')) { + $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex(); + $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P(); + } + foreach ($lexers as $name => $lexer) { + $result = $lexer->tokenizeHTML($input, $this->config, $this->context); + if (isset($alt_expect[$name])) { + if ($alt_expect[$name] === false) continue; + $t_expect = $alt_expect[$name]; + $this->assertIdentical($result, $alt_expect[$name], "$name: %s"); + } else { + $t_expect = $expect; + $this->assertIdentical($result, $expect, "$name: %s"); + } + if ($t_expect != $result) { + printTokens($result); + } + } + } + + function test_tokenizeHTML_emptyInput() { + $this->assertTokenization('', array()); + } + + function test_tokenizeHTML_plainText() { + $this->assertTokenization( + 'This is regular text.', + array( + new HTMLPurifier_Token_Text('This is regular text.') + ) + ); + } + + function test_tokenizeHTML_textAndTags() { + $this->assertTokenization( + 'This is bold text', + array( + new HTMLPurifier_Token_Text('This is '), + new HTMLPurifier_Token_Start('b', array()), + new HTMLPurifier_Token_Text('bold'), + new HTMLPurifier_Token_End('b'), + new HTMLPurifier_Token_Text(' text'), + ) + ); + } + + function test_tokenizeHTML_normalizeCase() { + $this->assertTokenization( + '
Totally rad dude. asdf
', + array( + new HTMLPurifier_Token_Start('DIV', array()), + new HTMLPurifier_Token_Text('Totally rad dude. '), + new HTMLPurifier_Token_Start('b', array()), + new HTMLPurifier_Token_Text('asdf'), + new HTMLPurifier_Token_End('b'), + new HTMLPurifier_Token_End('div'), + ) + ); + } + + function test_tokenizeHTML_notWellFormed() { + $this->assertTokenization( + '', + array( + new HTMLPurifier_Token_Start('asdf'), + new HTMLPurifier_Token_End('asdf'), + new HTMLPurifier_Token_Start('d'), + new HTMLPurifier_Token_End('d'), + new HTMLPurifier_Token_Start('poOloka'), + new HTMLPurifier_Token_Start('poolasdf'), + new HTMLPurifier_Token_Start('ds'), + new HTMLPurifier_Token_End('asdf'), + new HTMLPurifier_Token_End('ASDF'), + ), + array( + 'DOMLex' => $alt = array( + new HTMLPurifier_Token_Empty('asdf'), + new HTMLPurifier_Token_Empty('d'), + new HTMLPurifier_Token_Start('pooloka'), + new HTMLPurifier_Token_Start('poolasdf'), + new HTMLPurifier_Token_Empty('ds'), + new HTMLPurifier_Token_End('poolasdf'), + new HTMLPurifier_Token_End('pooloka'), + ), + 'PH5P' => $alt, + ) + ); + } + + function test_tokenizeHTML_whitespaceInTag() { + $this->assertTokenization( + 'Link to foobar', + array( + new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')), + new HTMLPurifier_Token_Text('Link to '), + new HTMLPurifier_Token_Start('b',array('id'=>'asdf')), + new HTMLPurifier_Token_Text('foobar'), + new HTMLPurifier_Token_End('b'), + new HTMLPurifier_Token_End('a'), + ) + ); + } + + function test_tokenizeHTML_singleAttribute() { + $this->assertTokenization( + '
', + array( + new HTMLPurifier_Token_Empty('br', array('style' => '&')) + ) + ); + } + + function test_tokenizeHTML_emptyTag() { + $this->assertTokenization( + '
', + array( new HTMLPurifier_Token_Empty('br') ) + ); + } + + function test_tokenizeHTML_comment() { + $this->assertTokenization( + '', + array( new HTMLPurifier_Token_Comment(' Comment ') ) + ); + } + + function test_tokenizeHTML_malformedComment() { + $this->assertTokenization( + '', + array( new HTMLPurifier_Token_Comment(' not so well formed -') ) + ); + } + + function test_tokenizeHTML_unterminatedTag() { + $this->assertTokenization( + ' array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ), + 'PH5P' => false, // total barfing, grabs scaffolding too + ) + ); + } + + function test_tokenizeHTML_specialEntities() { + $this->assertTokenization( + '<b>', + array( + new HTMLPurifier_Token_Text('') + ), + array( + // some parsers will separate entities out + 'PH5P' => array( + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('b'), + new HTMLPurifier_Token_Text('>'), + ), + ) + ); + } + + function test_tokenizeHTML_earlyQuote() { + $this->assertTokenization( + '', + array( new HTMLPurifier_Token_Empty('a') ), + array( + // we barf on this input + 'DirectLex' => array( + new HTMLPurifier_Token_Start('a', array('"' => '')) + ), + 'PH5P' => false, // behavior varies; handle this personally + ) + ); + } + + function test_tokenizeHTML_earlyQuote_PH5P() { + if (!class_exists('DOMDocument')) return; + $lexer = new HTMLPurifier_Lexer_PH5P(); + $result = $lexer->tokenizeHTML('', $this->config, $this->context); + if ($this->context->get('PH5PError', true)) { + $this->assertIdentical(array( + new HTMLPurifier_Token_Start('a', array('"' => '')) + ), $result); + } else { + $this->assertIdentical(array( + new HTMLPurifier_Token_Empty('a', array('"' => '')) + ), $result); + } + } + + function test_tokenizeHTML_unescapedQuote() { + $this->assertTokenization( + '"', + array( new HTMLPurifier_Token_Text('"') ) + ); + } + + function test_tokenizeHTML_escapedQuote() { + $this->assertTokenization( + '"', + array( new HTMLPurifier_Token_Text('"') ) + ); + } + + function test_tokenizeHTML_cdata() { + $this->assertTokenization( + 'can't get me!]]>', + array( new HTMLPurifier_Token_Text('You can't get me!') ), + array( + 'PH5P' => array( + new HTMLPurifier_Token_Text('You '), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('b'), + new HTMLPurifier_Token_Text('>'), + new HTMLPurifier_Token_Text('can'), + new HTMLPurifier_Token_Text('&'), + new HTMLPurifier_Token_Text('#39;t'), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('/b'), + new HTMLPurifier_Token_Text('>'), + new HTMLPurifier_Token_Text(' get me!'), + ), + ) + ); + } + + function test_tokenizeHTML_characterEntity() { + $this->assertTokenization( + 'θ', + array( new HTMLPurifier_Token_Text("\xCE\xB8") ) + ); + } + + function test_tokenizeHTML_characterEntityInCDATA() { + $this->assertTokenization( + '', + array( new HTMLPurifier_Token_Text("→") ), + array( + 'PH5P' => array( + new HTMLPurifier_Token_Text('&'), + new HTMLPurifier_Token_Text('rarr;'), + ), + ) + ); + } + + function test_tokenizeHTML_entityInAttribute() { + $this->assertTokenization( + 'Link', + array( + new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')), + new HTMLPurifier_Token_Text('Link'), + new HTMLPurifier_Token_End('a'), + ) + ); + } + + function test_tokenizeHTML_preserveUTF8() { + $this->assertTokenization( + "\xCE\xB8", + array( new HTMLPurifier_Token_Text("\xCE\xB8") ) + ); + } + + function test_tokenizeHTML_specialEntityInAttribute() { + $this->assertTokenization( + '
', + array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) ) + ); + } + + function test_tokenizeHTML_emoticonProtection() { + $this->assertTokenization( + 'Whoa! <3 That\'s not good >.>', + array( + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('Whoa! '), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('3 That\'s not good >.>'), + new HTMLPurifier_Token_End('b') + ), + array( + // text is absorbed together + 'DOMLex' => array( + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'), + new HTMLPurifier_Token_End('b'), + ), + 'PH5P' => array( // interesting grouping + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('Whoa! '), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('3 That\'s not good >.>'), + new HTMLPurifier_Token_End('b'), + ), + ) + ); + } + + function test_tokenizeHTML_commentWithFunkyChars() { + $this->assertTokenization( + '
', + array( + new HTMLPurifier_Token_Comment(' This >< comment '), + new HTMLPurifier_Token_Empty('br'), + ) + ); + } + + function test_tokenizeHTML_unterminatedComment() { + $this->assertTokenization( + '', + array( new HTMLPurifier_Token_Comment(' This comment < < & ') ) + ); + } + + function test_tokenizeHTML_attributeWithSpecialCharacters() { + $this->assertTokenization( + '', + array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ), + array( + 'DirectLex' => array( + new HTMLPurifier_Token_Start('a', array('href' => '')), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('">'), + ) + ) + ); + } + + function test_tokenizeHTML_emptyTagWithSlashInAttribute() { + $this->assertTokenization( + '', + array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) ) + ); + } + + function test_tokenizeHTML_style() { + $extra = array( + // PH5P doesn't seem to like style tags + 'PH5P' => false, + // DirectLex defers to RemoveForeignElements for textification + 'DirectLex' => array( + new HTMLPurifier_Token_Start('style', array('type' => 'text/css')), + new HTMLPurifier_Token_Comment("\ndiv {}\n"), + new HTMLPurifier_Token_End('style'), + ), + ); + if (!defined('LIBXML_VERSION')) { + // LIBXML_VERSION is missing in early versions of PHP + // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise, + // this translates to 5.0.x. In such cases, punt the test entirely. + return; + } elseif (LIBXML_VERSION < 20628) { + // libxml's behavior is wrong prior to this version, so make + // appropriate accomodations + $extra['DOMLex'] = $extra['DirectLex']; + } + $this->assertTokenization( +'', + array( + new HTMLPurifier_Token_Start('style', array('type' => 'text/css')), + new HTMLPurifier_Token_Text("\ndiv {}\n"), + new HTMLPurifier_Token_End('style'), + ), + $extra + ); + } + + function test_tokenizeHTML_tagWithAtSignAndExtraGt() { + $alt_expect = array( + // Technically this is invalid, but it won't be a + // problem with invalid element removal; also, this + // mimics Mozilla's parsing of the tag. + new HTMLPurifier_Token_Start('a@'), + new HTMLPurifier_Token_Text('>'), + ); + $this->assertTokenization( + '>', + array( + new HTMLPurifier_Token_Start('a'), + new HTMLPurifier_Token_Text('>'), + new HTMLPurifier_Token_End('a'), + ), + array( + 'DirectLex' => $alt_expect, + ) + ); + } + + function test_tokenizeHTML_emoticonHeart() { + $this->assertTokenization( + '
<3
', + array( + new HTMLPurifier_Token_Empty('br'), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('3'), + new HTMLPurifier_Token_Empty('br'), + ), + array( + 'DOMLex' => array( + new HTMLPurifier_Token_Empty('br'), + new HTMLPurifier_Token_Text('<3'), + new HTMLPurifier_Token_Empty('br'), + ), + ) + ); + } + + function test_tokenizeHTML_emoticonShiftyEyes() { + $this->assertTokenization( + '<<', + array( + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_End('b'), + ), + array( + 'DOMLex' => array( + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('<<'), + new HTMLPurifier_Token_End('b'), + ), + ) + ); + } + + function test_tokenizeHTML_eon1996() { + $this->assertTokenization( + '< test', + array( + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text(' '), + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('test'), + new HTMLPurifier_Token_End('b'), + ), + array( + 'DOMLex' => array( + new HTMLPurifier_Token_Text('< '), + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('test'), + new HTMLPurifier_Token_End('b'), + ), + ) + ); + } + + function test_tokenizeHTML_bodyInCDATA() { + $alt_tokens = array( + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('body'), + new HTMLPurifier_Token_Text('>'), + new HTMLPurifier_Token_Text('Foo'), + new HTMLPurifier_Token_Text('<'), + new HTMLPurifier_Token_Text('/body'), + new HTMLPurifier_Token_Text('>'), + ); + $this->assertTokenization( + 'Foo]]>', + array( + new HTMLPurifier_Token_Text('Foo'), + ), + array( + 'PH5P' => $alt_tokens, + ) + ); + } + + function test_tokenizeHTML_() { + $this->assertTokenization( + '
', + array( + new HTMLPurifier_Token_Start('a'), + new HTMLPurifier_Token_Empty('img'), + new HTMLPurifier_Token_End('a'), + ) + ); + } + + function test_tokenizeHTML_ignoreIECondComment() { + $this->assertTokenization( + '', + array() + ); + } + + function test_tokenizeHTML_removeProcessingInstruction() { + $this->config->set('Core.RemoveProcessingInstructions', true); + $this->assertTokenization( + '', + array() + ); + } + + function test_tokenizeHTML_removeNewline() { + $this->config->set('Core.NormalizeNewlines', true); + $this->assertTokenization( + "plain\rtext\r\n", + array( + new HTMLPurifier_Token_Text("plain\ntext\n") + ) + ); + } + + function test_tokenizeHTML_noRemoveNewline() { + $this->config->set('Core.NormalizeNewlines', false); + $this->assertTokenization( + "plain\rtext\r\n", + array( + new HTMLPurifier_Token_Text("plain\rtext\r\n") + ) + ); + } + + function test_tokenizeHTML_conditionalCommentUngreedy() { + $this->assertTokenization( + 'b', + array( + new HTMLPurifier_Token_Text("b") + ) + ); + } + + function test_tokenizeHTML_imgTag() { + $start = array( + new HTMLPurifier_Token_Start('img', + array( + 'src' => 'img_11775.jpg', + 'alt' => '[Img #11775]', + 'id' => 'EMBEDDED_IMG_11775', + ) + ) + ); + $this->assertTokenization( + '[Img #11775]', + array( + new HTMLPurifier_Token_Empty('img', + array( + 'src' => 'img_11775.jpg', + 'alt' => '[Img #11775]', + 'id' => 'EMBEDDED_IMG_11775', + ) + ) + ), + array( + 'DirectLex' => $start, + ) + ); + } + + + /* + + function test_tokenizeHTML_() { + $this->assertTokenization( + , + array( + + ) + ); + } + */ + +} + +// vim: et sw=4 sts=4 -- cgit v1.2.3