[wp-trac] [WordPress Trac] #34631: Extra compat for mbstring: mb_strpos()

WordPress Trac noreply at wordpress.org
Mon Nov 9 12:00:52 UTC 2015


#34631: Extra compat for mbstring: mb_strpos()
-------------------------+-----------------------------
 Reporter:  Cybr         |      Owner:
     Type:  enhancement  |     Status:  new
 Priority:  normal       |  Milestone:  Awaiting Review
Component:  Charset      |    Version:  trunk
 Severity:  normal       |   Keywords:
  Focuses:               |
-------------------------+-----------------------------
 Hello,

 I noticed a missing compat function within compat.php, regarding
 mb_strpos.

 The use of this function within a plugin will result in a fatal error if
 the server doesn't support mbstring.

 So I wrote a fallback that takes over whenever the native mb_strpos()
 does not exist.

 I also implemented the warnings PHP itself raises, based on the PHP 5.5 source:
 https://github.com/php/php-src/blob/PHP-5.5/ext/standard/string.c#L1824

 {{{#!php

 if ( ! function_exists( 'mb_strpos' ) ) :
     /**
      * Compat stand-in for mb_strpos(), declared only when no function of
      * that name exists yet.
      *
      * All arguments are handed unchanged to _mb_strpos(), whose result is
      * returned as-is.
      *
      * @param string      $haystack String to search in.
      * @param string      $needle   String to search for.
      * @param int         $offset   Optional. Start offset. Default 0.
      * @param string|null $encoding Optional. Character encoding. Default null.
      * @return mixed Whatever _mb_strpos() returns.
      */
     function mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) {
         $position = _mb_strpos( $haystack, $needle, $offset, $encoding );

         return $position;
     }
 endif;

 /**
  * Finds the character position of the first occurrence of a string in
  * another string, starting at a character offset.
  *
  * Only understands UTF-8 and 8bit. All other character sets are treated as
  * 8bit and handed straight to strpos(). For UTF-8, $haystack and $needle are
  * expected to be valid UTF-8 byte sequences.
  *
  * On an out-of-range offset, an array needle or an empty needle, an
  * E_USER_WARNING is raised and false is returned.
  *
  * @param string      $haystack The string to search in.
  * @param string      $needle   The string to search for.
  * @param int         $offset   Optional. Character offset at which the search
  *                               starts. Negative offsets are rejected. Default 0.
  * @param string|null $encoding Optional. Character encoding. When null, the
  *                               'blog_charset' option is used. Default null.
  * @return int|false Zero-based character position of the first match at or
  *                   after $offset, or false when there is no match or the
  *                   arguments are invalid.
  */
 if ( ! function_exists( '_mb_strpos' ) ) {
     function _mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) {

         if ( null === $encoding ) {
             $encoding = get_option( 'blog_charset' );
         }

         // The character-aware path below is UTF-8 only; any other charset is
         // treated as 8bit, where byte positions equal character positions.
         if ( ! in_array( strtolower( (string) $encoding ), array( 'utf8', 'utf-8' ), true ) ) {
             return $offset === 0 ? strpos( $haystack, $needle ) : strpos( $haystack, $needle, $offset );
         }

         $offset       = (int) $offset;
         $haystack_len = mb_strlen( $haystack, 'UTF-8' );

         // An offset equal to the length is allowed (nothing left to search).
         if ( $offset < 0 || $offset > $haystack_len ) {
             trigger_error( 'mb_strpos(): Offset not contained in string', E_USER_WARNING );
             return false;
         }

         if ( ! is_string( $needle ) ) {
             // Reject arrays before casting: the cast would silently yield "Array".
             if ( is_array( $needle ) ) {
                 trigger_error( 'mb_strpos(): Array to string conversion', E_USER_WARNING );
                 return false;
             }

             $needle = (string) $needle;
         }

         // Compare against '' explicitly; empty() would also reject the valid needle '0'.
         if ( '' === $needle ) {
             trigger_error( 'mb_strpos(): Empty needle', E_USER_WARNING );
             return false;
         }

         // Translate the character offset into a byte offset.
         $byte_offset = 0;
         if ( $offset > 0 ) {
             $byte_offset = strlen( mb_substr( $haystack, 0, $offset, 'UTF-8' ) );
         }

         // UTF-8 is self-synchronising: a valid needle can only match a valid
         // haystack on a character boundary, so a byte-wise search is safe and
         // needs no regular expression (and therefore no escaping of $needle).
         $byte_pos = strpos( $haystack, $needle, $byte_offset );

         if ( false === $byte_pos ) {
             return false;
         }

         // Translate the byte position back into a character position.
         // Position 0 is a legitimate result and must not be confused with false.
         return mb_strlen( substr( $haystack, 0, $byte_pos ), 'UTF-8' );
     }
 }
 }}}


 `if ( ! function_exists( '_mb_strpos' ) ) {` could probably be removed
 since it could be a core function.

 To test this, I've used the following lines of code:

 {{{#!php
 // Offsets to probe, in order. Expected results for each function:
 // 2, 2, 2, false, false + WARNING, false, false + WARNING.
 $offsets = array( 0, 1, 2, 3, -1, 4, 5 );

 // Fallback implementation.
 foreach ( $offsets as $offset ) {
     var_dump( _mb_strpos( '象形指事', '指', $offset ) );
 }

 echo PHP_EOL . PHP_EOL;

 // Public mb_strpos() (native or compat wrapper).
 foreach ( $offsets as $offset ) {
     var_dump( mb_strpos( '象形指事', '指', $offset ) );
 }
 }}}

 Feel free to contribute your thoughts :) Thanks!

--
Ticket URL: <https://core.trac.wordpress.org/ticket/34631>
WordPress Trac <https://core.trac.wordpress.org/>
WordPress publishing platform


More information about the wp-trac mailing list