diff options
Diffstat (limited to 'lib/htmlpurifier/maintenance/generate-entity-file.php')
-rwxr-xr-x | lib/htmlpurifier/maintenance/generate-entity-file.php | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/lib/htmlpurifier/maintenance/generate-entity-file.php b/lib/htmlpurifier/maintenance/generate-entity-file.php new file mode 100755 index 000000000..dee8e61e7 --- /dev/null +++ b/lib/htmlpurifier/maintenance/generate-entity-file.php @@ -0,0 +1,74 @@ +#!/usr/bin/php +<?php + +chdir(dirname(__FILE__)); +require_once 'common.php'; +assertCli(); + +/** + * @file + * Parses *.ent files into an entity lookup table, and then serializes and + * writes the whole kaboodle to a file. The resulting file is cached so + * that this script does not need to be run. This script should rarely, + * if ever, be run, since HTML's entities are fairly immutable. + */ + +// here's where the entity files are located, assuming working directory +// is the same as the location of this PHP file. Needs trailing slash. +$entity_dir = '../docs/entities/'; + +// defines the output file for the serialized content. +$output_file = '../library/HTMLPurifier/EntityLookup/entities.ser'; + +// courtesy of a PHP manual comment +function unichr($dec) { + if ($dec < 128) { + $utf = chr($dec); + } else if ($dec < 2048) { + $utf = chr(192 + (($dec - ($dec % 64)) / 64)); + $utf .= chr(128 + ($dec % 64)); + } else { + $utf = chr(224 + (($dec - ($dec % 4096)) / 4096)); + $utf .= chr(128 + ((($dec % 4096) - ($dec % 64)) / 64)); + $utf .= chr(128 + ($dec % 64)); + } + return $utf; +} + +if ( !is_dir($entity_dir) ) exit("Fatal Error: Can't find entity directory.\n"); +if ( file_exists($output_file) ) exit("Fatal Error: output file already exists.\n"); + +$dh = @opendir($entity_dir); +if ( !$dh ) exit("Fatal Error: Cannot read entity directory.\n"); + +$entity_files = array(); +while (($file = readdir($dh)) !== false) { + if (@$file[0] === '.') continue; + if (substr(strrchr($file, "."), 1) !== 'ent') continue; + $entity_files[] = $file; +} +closedir($dh); + +if ( !$entity_files ) exit("Fatal Error: No entity files to parse.\n"); + +$entity_table = array(); +$regexp = '/<!ENTITY\s+([A-Za-z0-9]+)\s+"&#(?:38;#)?([0-9]+);">/'; + +foreach ( $entity_files as $file ) { + $contents = file_get_contents($entity_dir . $file); + $matches = array(); + preg_match_all($regexp, $contents, $matches, PREG_SET_ORDER); + foreach ($matches as $match) { + $entity_table[$match[1]] = unichr($match[2]); + } +} + +$output = serialize($entity_table); + +$fh = fopen($output_file, 'w'); +fwrite($fh, $output); +fclose($fh); + +echo "Completed successfully."; + +// vim: et sw=4 sts=4 |