usort( $shorts, 'WP_Token_Map::longest_first_then_alphabetical' ); foreach ( $groups as $group_key => $group ) { usort( $groups[ $group_key ], static function ( array $a, array $b ): int { return self::longest_first_then_alphabetical( $a[0], $b[0] ); } ); } // Finally construct the optimized lookups. foreach ( $shorts as $word ) { $map->small_words .= str_pad( $word, $key_length + 1, "\x00", STR_PAD_RIGHT ); $map->small_mappings[] = $mappings[ $word ]; } $group_keys = array_keys( $groups ); sort( $group_keys ); foreach ( $group_keys as $group ) { $map->groups .= "{$group}\x00"; $group_string = ''; foreach ( $groups[ $group ] as $group_word ) { list( $word, $mapping ) = $group_word; $word_length = pack( 'C', strlen( $word ) ); $mapping_length = pack( 'C', strlen( $mapping ) ); $group_string .= "{$word_length}{$word}{$mapping_length}{$mapping}"; } $map->large_words[] = $group_string; } return $map; } /** * Creates a token map from a pre-computed table. * This skips the initialization cost of generating the table. * * This function should only be used to load data created with * WP_Token_Map::precomputed_php_source_tag(). * * @since 6.6.0 * * @param array $state { * Stores pre-computed state for directly loading into a Token Map. * * @type string $storage_version Which version of the code produced this state. * @type int $key_length Group key length. * @type string $groups Group lookup index. * @type array $large_words Large word groups and packed strings. * @type string $small_words Small words packed string. * @type array $small_mappings Small word mappings. * } * * @return WP_Token_Map Map with precomputed data loaded. */ public static function from_precomputed_table( $state ): ?WP_Token_Map { $has_necessary_state = isset( $state['storage_version'], $state['key_length'], $state['groups'], $state['large_words'], $state['small_words'], $state['small_mappings'] ); if ( ! $has_necessary_state ) { _doing_it_wrong( __METHOD__, __( 'Missing required inputs to pre-computed WP_Token_Map.' ), '6.6.0' ); return null; } if ( self::STORAGE_VERSION !== $state['storage_version'] ) { _doing_it_wrong( __METHOD__, /* translators: 1: version string, 2: version string. */ sprintf( __( 'Loaded version \'%1$s\' incompatible with expected version \'%2$s\'.' ), $state['storage_version'], self::STORAGE_VERSION ), '6.6.0' ); return null; } $map = new WP_Token_Map(); $map->key_length = $state['key_length']; $map->groups = $state['groups']; $map->large_words = $state['large_words']; $map->small_words = $state['small_words']; $map->small_mappings = $state['small_mappings']; return $map; } /** * Indicates if a given word is a lookup key in the map. * * Example: * * true === $smilies->contains( ':)' ); * false === $smilies->contains( 'simile' ); * * @since 6.6.0 * * @param string $word Determine if this word is a lookup key in the map. * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. * @return bool Whether there's an entry for the given word in the map. */ public function contains( string $word, string $case_sensitivity = 'case-sensitive' ): bool { $ignore_case = 'ascii-case-insensitive' === $case_sensitivity; if ( $this->key_length >= strlen( $word ) ) { if ( 0 === strlen( $this->small_words ) ) { return false; } $term = str_pad( $word, $this->key_length + 1, "\x00", STR_PAD_RIGHT ); $word_at = $ignore_case ? stripos( $this->small_words, $term ) : strpos( $this->small_words, $term ); if ( false === $word_at ) { return false; } return true; } $group_key = substr( $word, 0, $this->key_length ); $group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key ); if ( false === $group_at ) { return false; } $group = $this->large_words[ $group_at / ( $this->key_length + 1 ) ]; $group_length = strlen( $group ); $slug = substr( $word, $this->key_length ); $length = strlen( $slug ); $at = 0; while ( $at < $group_length ) { $token_length = unpack( 'C', $group[ $at++ ] )[1]; $token_at = $at; $at += $token_length; $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; $mapping_at = $at; if ( $token_length === $length && 0 === substr_compare( $group, $slug, $token_at, $token_length, $ignore_case ) ) { return true; } $at = $mapping_at + $mapping_length; } return false; } /** * If the text starting at a given offset is a lookup key in the map, * return the corresponding transformation from the map, else `false`. * * This function returns the translated string, but accepts an optional * parameter `$matched_token_byte_length`, which communicates how many * bytes long the lookup key was, if it found one. This can be used to * advance a cursor in calling code if a lookup key was found. * * Example: * * false === $smilies->read_token( 'Not sure :?.', 0, $token_byte_length ); * '😕' === $smilies->read_token( 'Not sure :?.', 9, $token_byte_length ); * 2 === $token_byte_length; * * Example: * * while ( $at < strlen( $input ) ) { * $next_at = strpos( $input, ':', $at ); * if ( false === $next_at ) { * break; * } * * $smily = $smilies->read_token( $input, $next_at, $token_byte_length ); * if ( false === $next_at ) { * ++$at; * continue; * } * * $prefix = substr( $input, $at, $next_at - $at ); * $at += $token_byte_length; * $output .= "{$prefix}{$smily}"; * } * * @since 6.6.0 * * @param string $text String in which to search for a lookup key. * @param int $offset Optional. How many bytes into the string where the lookup key ought to start. Default 0. * @param int|null &$matched_token_byte_length Optional. Holds byte-length of found token matched, otherwise not set. Default null. * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. * * @return string|null Mapped value of lookup key if found, otherwise `null`. */ public function read_token( string $text, int $offset = 0, &$matched_token_byte_length = null, $case_sensitivity = 'case-sensitive' ): ?string { $ignore_case = 'ascii-case-insensitive' === $case_sensitivity; $text_length = strlen( $text ); // Search for a long word first, if the text is long enough, and if that fails, a short one. if ( $text_length > $this->key_length ) { $group_key = substr( $text, $offset, $this->key_length ); $group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key ); if ( false === $group_at ) { // Perhaps a short word then. return strlen( $this->small_words ) > 0 ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity ) : null; } $group = $this->large_words[ $group_at / ( $this->key_length + 1 ) ]; $group_length = strlen( $group ); $at = 0; while ( $at < $group_length ) { $token_length = unpack( 'C', $group[ $at++ ] )[1]; $token = substr( $group, $at, $token_length ); $at += $token_length; $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; $mapping_at = $at; if ( 0 === substr_compare( $text, $token, $offset + $this->key_length, $token_length, $ignore_case ) ) { $matched_token_byte_length = $this->key_length + $token_length; return substr( $group, $mapping_at, $mapping_length ); } $at = $mapping_at + $mapping_length; } } // Perhaps a short word then. return strlen( $this->small_words ) > 0 ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity ) : null; } /** * Finds a match for a short word at the index. * * @since 6.6.0 * * @param string $text String in which to search for a lookup key. * @param int $offset Optional. How many bytes into the string where the lookup key ought to start. Default 0. * @param int|null &$matched_token_byte_length Optional. Holds byte-length of found lookup key if matched, otherwise not set. Default null. * @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'. * * @return string|null Mapped value of lookup key if found, otherwise `null`. */ private function read_small_token( string $text, int $offset = 0, &$matched_token_byte_length = null, $case_sensitivity = 'case-sensitive' ): ?string { $ignore_case = 'ascii-case-insensitive' === $case_sensitivity; $small_length = strlen( $this->small_words ); $search_text = substr( $text, $offset, $this->key_length ); if ( $ignore_case ) { $search_text = strtoupper( $search_text ); } $starting_char = $search_text[0]; $at = 0; while ( $at < $small_length ) { if ( $starting_char !== $this->small_words[ $at ] && ( ! $ignore_case || strtoupper( $this->small_words[ $at ] ) !== $starting_char ) ) { $at += $this->key_length + 1; continue; } for ( $adjust = 1; $adjust < $this->key_length; $adjust++ ) { if ( "\x00" === $this->small_words[ $at + $adjust ] ) { $matched_token_byte_length = $adjust; return $this->small_mappings[ $at / ( $this->key_length + 1 ) ]; } if ( $search_text[ $adjust ] !== $this->small_words[ $at + $adjust ] && ( ! $ignore_case || strtoupper( $this->small_words[ $at + $adjust ] !== $search_text[ $adjust ] ) ) ) { $at += $this->key_length + 1; continue 2; } } $matched_token_byte_length = $adjust; return $this->small_mappings[ $at / ( $this->key_length + 1 ) ]; } return null; } /** * Exports the token map into an associate array of key/value pairs. * * Example: * * $smilies->to_array() === array( * '8O' => '😯', * ':(' => '🙁', * ':)' => '🙂', * ':?' => '😕', * ); * * @return array The lookup key/substitution values as an associate array. */ public function to_array(): array { $tokens = array(); $at = 0; $small_mapping = 0; $small_length = strlen( $this->small_words ); while ( $at < $small_length ) { $key = rtrim( substr( $this->small_words, $at, $this->key_length + 1 ), "\x00" ); $value = $this->small_mappings[ $small_mapping++ ]; $tokens[ $key ] = $value; $at += $this->key_length + 1; } foreach ( $this->large_words as $index => $group ) { $prefix = substr( $this->groups, $index * ( $this->key_length + 1 ), 2 ); $group_length = strlen( $group ); $at = 0; while ( $at < $group_length ) { $length = unpack( 'C', $group[ $at++ ] )[1]; $key = $prefix . substr( $group, $at, $length ); $at += $length; $length = unpack( 'C', $group[ $at++ ] )[1]; $value = substr( $group, $at, $length ); $tokens[ $key ] = $value; $at += $length; } } return $tokens; } /** * Export the token map for quick loading in PHP source code. * * This function has a specific purpose, to make loading of static token maps fast. * It's used to ensure that the HTML character reference lookups add a minimal cost * to initializing the PHP process. * * Example: * * echo $smilies->precomputed_php_source_table(); * * // Output. * WP_Token_Map::from_precomputed_table( * array( * "storage_version" => "6.6.0", * "key_length" => 2, * "groups" => "", * "long_words" => array(), * "small_words" => "8O\x00:)\x00:(\x00:?\x00", * "small_mappings" => array( "😯", "🙂", "🙁", "😕" ) * ) * ); * * @since 6.6.0 * * @param string $indent Optional. Use this string for indentation, or rely on the default horizontal tab character. Default "\t". * @return string Value which can be pasted into a PHP source file for quick loading of table. */ public function precomputed_php_source_table( string $indent = "\t" ): string { $i1 = $indent; $i2 = $i1 . $indent; $i3 = $i2 . $indent; $class_version = self::STORAGE_VERSION; $output = self::class . "::from_precomputed_table(\n"; $output .= "{$i1}array(\n"; $output .= "{$i2}\"storage_version\" => \"{$class_version}\",\n"; $output .= "{$i2}\"key_length\" => {$this->key_length},\n"; $group_line = str_replace( "\x00", "\\x00", $this->groups ); $output .= "{$i2}\"groups\" => \"{$group_line}\",\n"; $output .= "{$i2}\"large_words\" => array(\n"; $prefixes = explode( "\x00", $this->groups ); foreach ( $prefixes as $index => $prefix ) { if ( '' === $prefix ) { break; } $group = $this->large_words[ $index ]; $group_length = strlen( $group ); $comment_line = "{$i3}//"; $data_line = "{$i3}\""; $at = 0; while ( $at < $group_length ) { $token_length = unpack( 'C', $group[ $at++ ] )[1]; $token = substr( $group, $at, $token_length ); $at += $token_length; $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; $mapping = substr( $group, $at, $mapping_length ); $at += $mapping_length; $token_digits = str_pad( dechex( $token_length ), 2, '0', STR_PAD_LEFT ); $mapping_digits = str_pad( dechex( $mapping_length ), 2, '0', STR_PAD_LEFT ); $mapping = preg_replace_callback( "~[\\x00-\\x1f\\x22\\x5c]~", static function ( $match_result ) { switch ( $match_result[0] ) { case '"': return '\\"'; case '\\': return '\\\\'; default: $hex = dechex( ord( $match_result[0] ) ); return "\\x{$hex}"; } }, $mapping ); $comment_line .= " {$prefix}{$token}[{$mapping}]"; $data_line .= "\\x{$token_digits}{$token}\\x{$mapping_digits}{$mapping}"; } $comment_line .= ".\n"; $data_line .= "\",\n"; $output .= $comment_line; $output .= $data_line; } $output .= "{$i2}),\n"; $small_words = array(); $small_length = strlen( $this->small_words ); $at = 0; while ( $at < $small_length ) { $small_words[] = substr( $this->small_words, $at, $this->key_length + 1 ); $at += $this->key_length + 1; } $small_text = str_replace( "\x00", '\x00', implode( '', $small_words ) ); $output .= "{$i2}\"small_words\" => \"{$small_text}\",\n"; $output .= "{$i2}\"small_mappings\" => array(\n"; foreach ( $this->small_mappings as $mapping ) { $output .= "{$i3}\"{$mapping}\",\n"; } $output .= "{$i2})\n"; $output .= "{$i1})\n"; $output .= ')'; return $output; } /** * Compares two strings, returning the longest, or whichever * is first alphabetically if they are the same length. * * This is an important sort when building the token map because * it should not form a match on a substring of a longer potential * match. For example, it should not detect `Cap` when matching * against the string `CapitalDifferentialD`. * * @since 6.6.0 * * @param string $a First string to compare. * @param string $b Second string to compare. * @return int -1 or lower if `$a` is less than `$b`; 1 or greater if `$a` is greater than `$b`, and 0 if they are equal. */ private static function longest_first_then_alphabetical( string $a, string $b ): int { if ( $a === $b ) { return 0; } $length_a = strlen( $a ); $length_b = strlen( $b ); // Longer strings are less-than for comparison's sake. if ( $length_a !== $length_b ) { return $length_b - $length_a; } return strcmp( $a, $b ); } }