1 files changed, 133 insertions, 84 deletions
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index 456e6e190..746b6e315 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -12,30 +12,44 @@
  */
 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
 {
-
+    /**
+     * @type bool
+     */
     public $tracksLineNumbers = true;
 
     /**
      * Whitespace characters for str(c)spn.
+     * @type string
      */
     protected $_whitespace = "\x20\x09\x0D\x0A";
 
     /**
      * Callback function for script CDATA fudge
-     * @param $matches, in form of array(opening tag, contents, closing tag)
+     * @param array $matches, in form of array(opening tag, contents, closing tag)
+     * @return string
      */
-    protected function scriptCallback($matches) {
+    protected function scriptCallback($matches)
+    {
         return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
     }
 
-    public function tokenizeHTML($html, $config, $context) {
-
+    /**
+     * @param String $html
+     * @param HTMLPurifier_Config $config
+     * @param HTMLPurifier_Context $context
+     * @return array|HTMLPurifier_Token[]
+     */
+    public function tokenizeHTML($html, $config, $context)
+    {
         // special normalization for script tags without any armor
         // our "armor" heurstic is a < sign any number of whitespaces after
         // the first script tag
         if ($config->get('HTML.Trusted')) {
-            $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
-                array($this, 'scriptCallback'), $html);
+            $html = preg_replace_callback(
+                '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
+                array($this, 'scriptCallback'),
+                $html
+            );
         }
 
         $html = $this->normalize($html, $config, $context);
@@ -55,15 +69,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
 
         if ($maintain_line_numbers) {
             $current_line = 1;
-            $current_col  = 0;
+            $current_col = 0;
             $length = strlen($html);
         } else {
             $current_line = false;
-            $current_col  = false;
+            $current_col = false;
             $length = false;
         }
         $context->register('CurrentLine', $current_line);
-        $context->register('CurrentCol',  $current_col);
+        $context->register('CurrentCol', $current_col);
         $nl = "\n";
         // how often to manually recalculate. This will ALWAYS be right,
         // but it's pretty wasteful. Set to 0 to turn off
@@ -77,16 +91,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
         // for testing synchronization
         $loops = 0;
 
-        while(++$loops) {
-
+        while (++$loops) {
             // $cursor is either at the start of a token, or inside of
             // a tag (i.e. there was a < immediately before it), as indicated
             // by $inside_tag
 
             if ($maintain_line_numbers) {
-
                 // $rcursor, however, is always at the start of a token.
-                $rcursor = $cursor - (int) $inside_tag;
+                $rcursor = $cursor - (int)$inside_tag;
 
                 // Column number is cheap, so we calculate it every round.
                 // We're interested at the *end* of the newline string, so
@@ -96,14 +108,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
 
                 // recalculate lines
-                if (
-                    $synchronize_interval &&  // synchronization is on
-                    $cursor > 0 &&            // cursor is further than zero
-                    $loops % $synchronize_interval === 0 // time to synchronize!
-                ) {
+                if ($synchronize_interval && // synchronization is on
+                    $cursor > 0 && // cursor is further than zero
+                    $loops % $synchronize_interval === 0) { // time to synchronize!
                     $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                 }
-
             }
 
             $position_next_lt = strpos($html, '<', $cursor);
@@ -119,35 +128,42 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
             if (!$inside_tag && $position_next_lt !== false) {
                 // We are not inside tag and there still is another tag to parse
                 $token = new
-                    HTMLPurifier_Token_Text(
-                        $this->parseData(
-                            substr(
-                                $html, $cursor, $position_next_lt - $cursor
-                            )
+                HTMLPurifier_Token_Text(
+                    $this->parseData(
+                        substr(
+                            $html,
+                            $cursor,
+                            $position_next_lt - $cursor
                         )
-                    );
+                    )
+                );
                 if ($maintain_line_numbers) {
                     $token->rawPosition($current_line, $current_col);
                     $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                 }
                 $array[] = $token;
-                $cursor  = $position_next_lt + 1;
+                $cursor = $position_next_lt + 1;
                 $inside_tag = true;
                 continue;
             } elseif (!$inside_tag) {
                 // We are not inside tag but there are no more tags
                 // If we're already at the end, break
-                if ($cursor === strlen($html)) break;
+                if ($cursor === strlen($html)) {
+                    break;
+                }
                 // Create Text of rest of string
                 $token = new
-                    HTMLPurifier_Token_Text(
-                        $this->parseData(
-                            substr(
-                                $html, $cursor
-                            )
+                HTMLPurifier_Token_Text(
+                    $this->parseData(
+                        substr(
+                            $html,
+                            $cursor
                         )
-                    );
-                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
+                    )
+                );
+                if ($maintain_line_numbers) {
+                    $token->rawPosition($current_line, $current_col);
+                }
                 $array[] = $token;
                 break;
             } elseif ($inside_tag && $position_next_gt !== false) {
@@ -171,16 +187,16 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 }
 
                 // Check if it's a comment
-                if (
-                    substr($segment, 0, 3) === '!--'
-                ) {
+                if (substr($segment, 0, 3) === '!--') {
                     // re-determine segment length, looking for -->
                     $position_comment_end = strpos($html, '-->', $cursor);
                     if ($position_comment_end === false) {
                         // uh oh, we have a comment that extends to
                         // infinity. Can't be helped: set comment
                         // end position to end of string
-                        if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
+                        if ($e) {
+                            $e->send(E_WARNING, 'Lexer: Unclosed comment');
+                        }
                         $position_comment_end = strlen($html);
                         $end = true;
                     } else {
@@ -189,11 +205,13 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                     $strlen_segment = $position_comment_end - $cursor;
                     $segment = substr($html, $cursor, $strlen_segment);
                     $token = new
-                        HTMLPurifier_Token_Comment(
-                            substr(
-                                $segment, 3, $strlen_segment - 3
-                            )
-                        );
+                    HTMLPurifier_Token_Comment(
+                        substr(
+                            $segment,
+                            3,
+                            $strlen_segment - 3
+                        )
+                    );
                     if ($maintain_line_numbers) {
                         $token->rawPosition($current_line, $current_col);
                         $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
@@ -205,7 +223,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 }
 
                 // Check if it's an end tag
-                $is_end_tag = (strpos($segment,'/') === 0);
+                $is_end_tag = (strpos($segment, '/') === 0);
                 if ($is_end_tag) {
                     $type = substr($segment, 1);
                     $token = new HTMLPurifier_Token_End($type);
@@ -224,7 +242,9 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 // text and go our merry way
                 if (!ctype_alpha($segment[0])) {
                     // XML:  $segment[0] !== '_' && $segment[0] !== ':'
-                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
+                    if ($e) {
+                        $e->send(E_NOTICE, 'Lexer: Unescaped lt');
+                    }
                     $token = new HTMLPurifier_Token_Text('<');
                     if ($maintain_line_numbers) {
                         $token->rawPosition($current_line, $current_col);
@@ -239,7 +259,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 // trailing slash. Remember, we could have a tag like <br>, so
                 // any later token processing scripts must convert improperly
                 // classified EmptyTags from StartTags.
-                $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
+                $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
                 if ($is_self_closing) {
                     $strlen_segment--;
                     $segment = substr($segment, 0, $strlen_segment);
@@ -269,14 +289,16 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 $attribute_string =
                     trim(
                         substr(
-                            $segment, $position_first_space
+                            $segment,
+                            $position_first_space
                         )
                     );
                 if ($attribute_string) {
                     $attr = $this->parseAttributeString(
-                                    $attribute_string
-                                  , $config, $context
-                              );
+                        $attribute_string,
+                        $config,
+                        $context
+                    );
                 } else {
                     $attr = array();
                 }
@@ -296,15 +318,19 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 continue;
             } else {
                 // inside tag, but there's no ending > sign
-                if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
+                if ($e) {
+                    $e->send(E_WARNING, 'Lexer: Missing gt');
+                }
                 $token = new
-                    HTMLPurifier_Token_Text(
-                        '<' .
-                        $this->parseData(
-                            substr($html, $cursor)
-                        )
-                    );
-                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
+                HTMLPurifier_Token_Text(
+                    '<' .
+                    $this->parseData(
+                        substr($html, $cursor)
+                    )
+                );
+                if ($maintain_line_numbers) {
+                    $token->rawPosition($current_line, $current_col);
+                }
                 // no cursor scroll? Hmm...
                 $array[] = $token;
                 break;
@@ -319,8 +345,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
 
     /**
      * PHP 5.0.x compatible substr_count that implements offset and length
+     * @param string $haystack
+     * @param string $needle
+     * @param int $offset
+     * @param int $length
+     * @return int
      */
-    protected function substrCount($haystack, $needle, $offset, $length) {
+    protected function substrCount($haystack, $needle, $offset, $length)
+    {
         static $oldVersion;
         if ($oldVersion === null) {
             $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
@@ -336,13 +368,18 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
     /**
      * Takes the inside of an HTML tag and makes an assoc array of attributes.
      *
-     * @param $string Inside of tag excluding name.
-     * @returns Assoc array of attributes.
+     * @param string $string Inside of tag excluding name.
+     * @param HTMLPurifier_Config $config
+     * @param HTMLPurifier_Context $context
+     * @return array Assoc array of attributes.
      */
-    public function parseAttributeString($string, $config, $context) {
-        $string = (string) $string; // quick typecast
+    public function parseAttributeString($string, $config, $context)
+    {
+        $string = (string)$string; // quick typecast
 
-        if ($string == '') return array(); // no attributes
+        if ($string == '') {
+            return array();
+        } // no attributes
 
         $e = false;
         if ($config->get('Core.CollectErrors')) {
@@ -361,46 +398,55 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
             list($key, $quoted_value) = explode('=', $string);
             $quoted_value = trim($quoted_value);
             if (!$key) {
-                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
+                if ($e) {
+                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
+                }
                 return array();
             }
-            if (!$quoted_value) return array($key => '');
+            if (!$quoted_value) {
+                return array($key => '');
+            }
             $first_char = @$quoted_value[0];
-            $last_char  = @$quoted_value[strlen($quoted_value)-1];
+            $last_char = @$quoted_value[strlen($quoted_value) - 1];
 
             $same_quote = ($first_char == $last_char);
             $open_quote = ($first_char == '"' || $first_char == "'");
 
-            if ( $same_quote && $open_quote) {
+            if ($same_quote && $open_quote) {
                 // well behaved
                 $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
             } else {
                 // not well behaved
                 if ($open_quote) {
-                    if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
+                    if ($e) {
+                        $e->send(E_ERROR, 'Lexer: Missing end quote');
+                    }
                     $value = substr($quoted_value, 1);
                 } else {
                     $value = $quoted_value;
                 }
             }
-            if ($value === false) $value = '';
+            if ($value === false) {
+                $value = '';
+            }
             return array($key => $this->parseData($value));
         }
 
         // setup loop environment
-        $array  = array(); // return assoc array of attributes
+        $array = array(); // return assoc array of attributes
         $cursor = 0; // current position in string (moves forward)
-        $size   = strlen($string); // size of the string (stays the same)
+        $size = strlen($string); // size of the string (stays the same)
 
         // if we have unquoted attributes, the parser expects a terminating
         // space, so let's guarantee that there's always a terminating space.
         $string .= ' ';
 
-        while(true) {
-
-            if ($cursor >= $size) {
-                break;
+        $old_cursor = -1;
+        while ($cursor < $size) {
+            if ($old_cursor >= $cursor) {
+                throw new Exception("Infinite loop detected");
             }
+            $old_cursor = $cursor;
 
             $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
             // grab the key
@@ -415,8 +461,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
             $key = substr($string, $key_begin, $key_end - $key_begin);
 
             if (!$key) {
-                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
-                $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
+                if ($e) {
+                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
+                }
+                $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
                 continue; // empty key
             }
 
@@ -467,24 +515,25 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 }
 
                 $value = substr($string, $value_begin, $value_end - $value_begin);
-                if ($value === false) $value = '';
+                if ($value === false) {
+                    $value = '';
+                }
                 $array[$key] = $this->parseData($value);
                 $cursor++;
-
             } else {
                 // boolattr
                 if ($key !== '') {
                     $array[$key] = $key;
                 } else {
                     // purely theoretical
-                    if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
+                    if ($e) {
+                        $e->send(E_ERROR, 'Lexer: Missing attribute key');
+                    }
                 }
-
             }
         }
         return $array;
     }
-
 }
 
 // vim: et sw=4 sts=4