[wp-trac] [WordPress Trac] #34631: Extra compat for mbstring: mb_strpos()
WordPress Trac
noreply at wordpress.org
Mon Nov 9 12:00:52 UTC 2015
#34631: Extra compat for mbstring: mb_strpos()
-------------------------+-----------------------------
Reporter: Cybr | Owner:
Type: enhancement | Status: new
Priority: normal | Milestone: Awaiting Review
Component: Charset | Version: trunk
Severity: normal | Keywords:
Focuses: |
-------------------------+-----------------------------
Hello,
I noticed a missing compat function within compat.php, regarding
mb_strpos.
The use of this function within a plugin will result in a fatal error if
the server doesn't support mbstring.
So I wrote a fallback function that takes over when the native one does
not exist.
I also implemented the warnings it raises, based on the PHP 5.5 source:
https://github.com/php/php-src/blob/PHP-5.5/ext/standard/string.c#L1824
{{{#!php
if ( ! function_exists( 'mb_strpos' ) ) {
	/**
	 * Stand-in for mb_strpos(), declared only when no function of that name exists yet.
	 *
	 * All four arguments are forwarded unchanged to _mb_strpos() and its result
	 * is returned as-is.
	 *
	 * @param mixed $haystack Forwarded to _mb_strpos() as the string to search in.
	 * @param mixed $needle   Forwarded to _mb_strpos() as the string to search for.
	 * @param mixed $offset   Optional. Forwarded search offset. Default 0.
	 * @param mixed $encoding Optional. Forwarded encoding name. Default null.
	 * @return mixed Whatever _mb_strpos() returns.
	 */
	function mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) {
		$position = _mb_strpos( $haystack, $needle, $offset, $encoding );

		return $position;
	}
}
/**
 * Fallback implementation of mb_strpos() that does not need the mbstring extension.
 *
 * Only understands UTF-8 and 8bit. Every encoding other than UTF-8 is treated as
 * 8bit and handed straight to strpos().
 *
 * For UTF-8, $haystack and $needle are expected to be valid UTF-8 byte sequences.
 * The search itself is done on bytes: in valid UTF-8 a lead byte can never be
 * mistaken for a continuation byte, so a byte-level match of a valid needle always
 * starts on a character boundary. Only the offset (going in) and the match
 * position (coming out) have to be converted between characters and bytes.
 *
 * @param string      $haystack The string to search in.
 * @param string      $needle   The string to search for. May be longer than one character.
 * @param int         $offset   Optional. Character position at which the search starts. Default 0.
 * @param string|null $encoding Optional. Character encoding. When null, the 'blog_charset'
 *                              option is read via get_option(). Default null.
 * @return int|false Character position of the first match at or after $offset (0 is a
 *                   valid result), or false when there is no match or an argument is
 *                   rejected. In the UTF-8 path rejected arguments raise an E_USER_WARNING.
 */
if ( ! function_exists( '_mb_strpos' ) ) {
	function _mb_strpos( $haystack, $needle, $offset = 0, $encoding = null ) {
		if ( null === $encoding ) {
			$encoding = get_option( 'blog_charset' );
		}

		// The solution below works only for UTF-8,
		// so in case of a different charset just use built-in strpos().
		// Strict comparison keeps non-string values (true, 0) from matching the list.
		if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
			return strpos( $haystack, $needle, $offset );
		}

		$haystack = (string) $haystack;
		$offset   = (int) $offset;

		// One pass over the bytes: count the characters and remember the byte
		// index at which character number $offset begins. A byte of the form
		// 10xxxxxx is a continuation byte and never starts a character.
		$byte_total  = strlen( $haystack );
		$char_total  = 0;
		$byte_offset = $byte_total; // Correct when $offset equals the character count.
		for ( $i = 0; $i < $byte_total; $i++ ) {
			if ( 0x80 === ( ord( $haystack[ $i ] ) & 0xC0 ) ) {
				continue;
			}
			if ( $char_total === $offset ) {
				$byte_offset = $i;
			}
			$char_total++;
		}

		if ( $offset < 0 || $offset > $char_total ) {
			trigger_error( 'mb_strpos(): Offset not contained in string', E_USER_WARNING );
			return false;
		}

		// Must be checked before the cast: casting an array yields the string "Array".
		if ( is_array( $needle ) ) {
			trigger_error( 'mb_strpos(): Array to string conversion', E_USER_WARNING );
			return false;
		}
		$needle = (string) $needle;

		// Compare against '' rather than using empty(), which would also reject '0'.
		if ( '' === $needle ) {
			trigger_error( 'mb_strpos(): Empty needle', E_USER_WARNING );
			return false;
		}

		$byte_pos = strpos( $haystack, $needle, $byte_offset );
		if ( false === $byte_pos ) {
			return false;
		}

		// Translate the byte position back into a character position by counting
		// the characters that start before the match. A match at the very start
		// yields int 0, which is distinct from the false returned for "not found".
		$char_pos = 0;
		for ( $i = 0; $i < $byte_pos; $i++ ) {
			if ( 0x80 !== ( ord( $haystack[ $i ] ) & 0xC0 ) ) {
				$char_pos++;
			}
		}

		return $char_pos;
	}
}
}}}
The `if ( ! function_exists( '_mb_strpos' ) ) {` guard could probably be
removed if this becomes a core function.
To test this, I've used the following lines of code:
{{{#!php
// Offsets to probe against the four-character haystack. Expected dumps:
//   0, 1, 2 => int 2
//   3, 4    => false
//   -1, 5   => false, plus a WARNING
$probe_offsets = array( 0, 1, 2, 3, -1, 4, 5 );

// First pass calls _mb_strpos() directly, second pass calls mb_strpos();
// the two passes are separated by two line breaks.
foreach ( array( '_mb_strpos', 'mb_strpos' ) as $pass => $callable ) {
	if ( 0 !== $pass ) {
		echo PHP_EOL . PHP_EOL;
	}

	foreach ( $probe_offsets as $probe_offset ) {
		var_dump( $callable( '象形指事', '指', $probe_offset ) );
	}
}
}}}
Feel free to contribute your thoughts :) Thanks!
--
Ticket URL: <https://core.trac.wordpress.org/ticket/34631>
WordPress Trac <https://core.trac.wordpress.org/>
WordPress publishing platform
More information about the wp-trac
mailing list