aboutsummaryrefslogtreecommitdiffstats
path: root/library/HTMLPurifier/Strategy/FixNesting.php
blob: f81802391bf64426a536335b310a81bdf07b849c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
<?php

/**
 * Takes a well formed list of tokens and fixes their nesting.
 *
 * HTML elements dictate which elements are allowed to be their children,
 * for example, you can't have a p tag in a span tag.  Other elements have
 * much more rigorous definitions: tables, for instance, require a specific
 * order for their elements.  There are also constraints not expressible by
 * document type definitions, such as the chameleon nature of ins/del
 * tags and global child exclusions.
 *
 * The first major objective of this strategy is to iterate through all the
 * nodes (not tokens) of the list of tokens and determine whether or not
 * their children conform to the element's definition.  If they do not, the
 * child definition may optionally supply an amended list of elements that
 * is valid or require that the entire node be deleted (and the previous
 * node rescanned).
 *
 * The second objective is to ensure that explicitly excluded elements of
 * an element do not appear in its children.  Code that accomplishes this
 * task is pervasive through the strategy, though the two are distinct tasks
 * and could, theoretically, be seperated (although it's not recommended).
 *
 * @note Whether or not unrecognized children are silently dropped or
 *       translated into text depends on the child definitions.
 *
 * @todo Enable nodes to be bubbled out of the structure.
 */

class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    public function execute($tokens, $config, $context) {
        //####################################################################//
        // Pre-processing

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();

        // insert implicit "parent" node, will be removed at end.
        // DEFINITION CALL
        $parent_name = $definition->info_parent;
        array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
        $tokens[] = new HTMLPurifier_Token_End($parent_name);

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        $is_inline = $definition->info_parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // stack that contains the indexes of all parents,
        // $stack[count($stack)-1] being the current parent
        $stack = array();

        // stack that contains all elements that are excluded
        // it is organized by parent elements, similar to $stack,
        // but it is only populated when an element with exclusions is
        // processed, i.e. there won't be empty exclusions.
        $exclude_stack = array();

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $start_token = false;
        $context->register('CurrentToken', $start_token);

        //####################################################################//
        // Loop

        // iterate through all start nodes. Determining the start node
        // is complicated so it has been omitted from the loop construct
        for ($i = 0, $size = count($tokens) ; $i < $size; ) {

            //################################################################//
            // Gather information on children

            // child token accumulator
            $child_tokens = array();

            // scroll to the end of this node, report number, and collect
            // all children
            for ($j = $i, $depth = 0; ; $j++) {
                if ($tokens[$j] instanceof HTMLPurifier_Token_Start) {
                    $depth++;
                    // skip token assignment on first iteration, this is the
                    // token we currently are on
                    if ($depth == 1) continue;
                } elseif ($tokens[$j] instanceof HTMLPurifier_Token_End) {
                    $depth--;
                    // skip token assignment on last iteration, this is the
                    // end token of the token we're currently on
                    if ($depth == 0) break;
                }
                $child_tokens[] = $tokens[$j];
            }

            // $i is index of start token
            // $j is index of end token

            $start_token = $tokens[$i]; // to make token available via CurrentToken

            //################################################################//
            // Gather information on parent

            // calculate parent information
            if ($count = count($stack)) {
                $parent_index = $stack[$count-1];
                $parent_name  = $tokens[$parent_index]->name;
                if ($parent_index == 0) {
                    $parent_def   = $definition->info_parent_def;
                } else {
                    $parent_def   = $definition->info[$parent_name];
                }
            } else {
                // processing as if the parent were the "root" node
                // unknown info, it won't be used anyway, in the future,
                // we may want to enforce one element only (this is
                // necessary for HTML Purifier to clean entire documents
                $parent_index = $parent_name = $parent_def = null;
            }

            // calculate context
            if ($is_inline === false) {
                // check if conditions make it inline
                if (!empty($parent_def) && $parent_def->descendants_are_inline) {
                    $is_inline = $count - 1;
                }
            } else {
                // check if we're out of inline
                if ($count === $is_inline) {
                    $is_inline = false;
                }
            }

            //################################################################//
            // Determine whether element is explicitly excluded SGML-style

            // determine whether or not element is excluded by checking all
            // parent exclusions. The array should not be very large, two
            // elements at most.
            $excluded = false;
            if (!empty($exclude_stack)) {
                foreach ($exclude_stack as $lookup) {
                    if (isset($lookup[$tokens[$i]->name])) {
                        $excluded = true;
                        // no need to continue processing
                        break;
                    }
                }
            }

            //################################################################//
            // Perform child validation

            if ($excluded) {
                // there is an exclusion, remove the entire node
                $result = false;
                $excludes = array(); // not used, but good to initialize anyway
            } else {
                // DEFINITION CALL
                if ($i === 0) {
                    // special processing for the first node
                    $def = $definition->info_parent_def;
                } else {
                    $def = $definition->info[$tokens[$i]->name];

                }

                if (!empty($def->child)) {
                    // have DTD child def validate children
                    $result = $def->child->validateChildren(
                        $child_tokens, $config, $context);
                } else {
                    // weird, no child definition, get rid of everything
                    $result = false;
                }

                // determine whether or not this element has any exclusions
                $excludes = $def->excludes;
            }

            // $result is now a bool or array

            //################################################################//
            // Process result by interpreting $result

            if ($result === true || $child_tokens === $result) {
                // leave the node as is

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            } elseif($result === false) {
                // remove entire node

                if ($e) {
                    if ($excluded) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                    } else {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                    }
                }

                // calculate length of inner tokens and current tokens
                $length = $j - $i + 1;

                // perform removal
                array_splice($tokens, $i, $length);

                // update size
                $size -= $length;

                // there is no start token to register,
                // current node is now the next possible start node
                // unless it turns out that we need to do a double-check

                // this is a rought heuristic that covers 100% of HTML's
                // cases and 99% of all other cases. A child definition
                // that would be tricked by this would be something like:
                // ( | a b c) where it's all or nothing. Fortunately,
                // our current implementation claims that that case would
                // not allow empty, even if it did
                if (!$parent_def->child->allow_empty) {
                    // we need to do a double-check
                    $i = $parent_index;
                    array_pop($stack);
                }

                // PROJECTED OPTIMIZATION: Process all children elements before
                // reprocessing parent node.

            } else {
                // replace node with $result

                // calculate length of inner tokens
                $length = $j - $i - 1;

                if ($e) {
                    if (empty($result) && $length) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                    } else {
                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                    }
                }

                // perform replacement
                array_splice($tokens, $i + 1, $length, $result);

                // update size
                $size -= $length;
                $size += count($result);

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            }

            //################################################################//
            // Scroll to next start node

            // We assume, at this point, that $i is the index of the token
            // that is the first possible new start point for a node.

            // Test if the token indeed is a start tag, if not, move forward
            // and test again.
            $size = count($tokens);
            while ($i < $size and !$tokens[$i] instanceof HTMLPurifier_Token_Start) {
                if ($tokens[$i] instanceof HTMLPurifier_Token_End) {
                    // pop a token index off the stack if we ended a node
                    array_pop($stack);
                    // pop an exclusion lookup off exclusion stack if
                    // we ended node and that node had exclusions
                    if ($i == 0 || $i == $size - 1) {
                        // use specialized var if it's the super-parent
                        $s_excludes = $definition->info_parent_def->excludes;
                    } else {
                        $s_excludes = $definition->info[$tokens[$i]->name]->excludes;
                    }
                    if ($s_excludes) {
                        array_pop($exclude_stack);
                    }
                }
                $i++;
            }

        }

        //####################################################################//
        // Post-processing

        // remove implicit parent tokens at the beginning and end
        array_shift($tokens);
        array_pop($tokens);

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        return $tokens;

    }

}

// vim: et sw=4 sts=4