[wp-trac] [WordPress Trac] #29717: wp_check_invalid_utf8 - pcre tricks and failsafes, +mb_convert_encoding, iconv fix, performance

WordPress Trac noreply at wordpress.org
Fri Oct 17 05:38:10 UTC 2014


#29717: wp_check_invalid_utf8 - pcre tricks and failsafes, +mb_convert_encoding,
iconv fix, performance
--------------------------------+------------------------------------------
 Reporter:  askapache           |       Owner:
     Type:  enhancement         |      Status:  new
 Priority:  normal              |   Milestone:  Awaiting Review
Component:  Formatting          |     Version:  trunk
 Severity:  normal              |  Resolution:
 Keywords:  has-patch dev-      |     Focuses:  administration, performance
  feedback                      |
--------------------------------+------------------------------------------

Comment (by askapache):

 Did some benchmarking on both valid and invalid strings, at very long
 and normal lengths.

 At first I was also using mb_check_encoding, but it would cause maximum
 execution time errors even on medium-sized strings.

 {{{
 /*
 BENCHMARKS ON INVALID STRING (750,000 iterations)
  mb_strlen 16,049,664      750k time  avg from 15k iterations
     preg_match_modifier)   0.73318    0.014659
    preg_match_backtrack)   0.73956    0.014787
        htmlspecialchars)   45.36456   0.907278
      preg_match_pattern)   2.06490    0.041293
       mb_check_encoding)   CRASHED, IT CHECKS ENTIRE STRING SO TAKES
 FOREVERRRRRRR

  mb_strlen 2,674,944       750k time  avg from 15k iterations
     preg_match_modifier)   0.76279    0.015250
    preg_match_backtrack)   0.75758    0.015147
        htmlspecialchars)   0.83401    0.016673
      preg_match_pattern)   2.15377    0.043068

  mb_strlen 344             750k time  avg from 15k iterations
     preg_match_modifier)   0.74996    0.014995
    preg_match_backtrack)   0.73503    0.014697
        htmlspecialchars)   0.70115    0.014019
      preg_match_pattern)   2.06986    0.041393


 BENCHMARKS ON VALID STRING (750,000 iterations)

     strlen 26,873,856      750k time  avg from 15k iterations
     preg_match_modifier)   0.74948    0.014984
    preg_match_backtrack)   0.75690    0.015133
        htmlspecialchars)   44.17337   0.883453
      preg_match_pattern)   10.71417   0.214273

     strlen 16              750k time  avg from 15k iterations
     preg_match_modifier)   0.79939      0.015984
    preg_match_backtrack)   0.80240      0.016044
        htmlspecialchars)   0.86205      0.017237
      preg_match_pattern)   10.63511     0.212693
 */
 /**
  * Alternative strategies for testing whether a byte string is valid UTF-8.
  *
  * Every method takes the string to test and returns true when it is
  * considered valid UTF-8, false otherwise.
  */
 class utf_validity {
   /**
    * Validates by running an empty pattern with the `u` (UTF-8) modifier.
    *
    * preg_match() returns false, rather than 0 or 1, when the subject is
    * not valid UTF-8 in UTF mode, so only that failure value is checked.
    *
    * @param string $string Bytes to test.
    * @return bool True when PCRE accepted the subject.
    */
   public function preg_match_modifier($string) {
     return ( preg_match( '//u', $string ) !== false );
   }

   /**
    * Validates by enabling UTF mode through the `(*UTF8)` pattern-start option.
    *
    * Same principle as preg_match_modifier(): an empty pattern, with UTF
    * mode switched on inside the pattern instead of by a modifier.
    *
    * @param string $string Bytes to test.
    * @return bool True when PCRE accepted the subject.
    */
   public function preg_match_backtrack($string) {
     return ( preg_match( '/(*UTF8)/', $string ) !== false );
   }

   /**
    * Validates through htmlspecialchars(), which returns an empty string
    * for invalid input when ENT_IGNORE / ENT_SUBSTITUTE are not set.
    *
    * @param string $string Bytes to test.
    * @return bool True when the string is empty or was converted successfully.
    */
   public function htmlspecialchars($string) {
     // An empty string is valid UTF-8, but would be indistinguishable from
     // the "invalid input" result below, so handle it up front.
     if ( '' === $string ) {
       return true;
     }
     // ENT_NOQUOTES (0) is the value the former `null` flags argument was
     // coerced to; passing it explicitly avoids the PHP 8.1 deprecation.
     return ( htmlspecialchars( $string, ENT_NOQUOTES, 'UTF-8' ) !== '' );
   }

   /**
    * Validates through mb_check_encoding() (requires the mbstring extension).
    *
    * @param string $string Bytes to test.
    * @return bool True when mbstring reports the string as valid UTF-8.
    */
   public function mb_check_encoding($string) {
     return ( mb_check_encoding( $string, 'UTF-8' ) );
   }

   /**
    * Validates with a byte-wise pattern that matches any ill-formed sequence.
    *
    * The pattern is built once and cached in a static variable. It is run
    * without the `u` modifier, so each \xNN escape matches a single byte.
    *
    * @param string $string Bytes to test.
    * @return bool True only when the pattern ran and found no invalid bytes.
    */
   public function preg_match_pattern($string) {
     static $pattern;
     if ( null === $pattern ) {
       $pattern = '/('
       . '[\xC0-\xC1]' # Invalid UTF-8 Bytes
       . '|[\xF5-\xFF]' # Invalid UTF-8 Bytes
       . '|\xE0[\x80-\x9F]' # Overlong encoding of prior code point
       . '|\xF0[\x80-\x8F]' # Overlong encoding of prior code point
       . '|\xED[\xA0-\xBF]' # UTF-16 surrogates U+D800..U+DFFF
       . '|\xF4[\x90-\xBF]' # Code points above U+10FFFF
       . '|[\xC2-\xDF](?![\x80-\xBF])' # Invalid UTF-8 Sequence Start
       . '|[\xE0-\xEF](?![\x80-\xBF]{2})' # Invalid UTF-8 Sequence Start
       . '|[\xF0-\xF4](?![\x80-\xBF]{3})' # Invalid UTF-8 Sequence Start
       . '|(?<=[\x0-\x7F\xF5-\xFF])[\x80-\xBF]' # Invalid UTF-8 Sequence Middle
       # Overlong Sequence
       . '|(?<![\xC2-\xDF]|[\xE0-\xEF]|[\xE0-\xEF][\x80-\xBF]|[\xF0-\xF4]|[\xF0-\xF4][\x80-\xBF]|[\xF0-\xF4][\x80-\xBF]{2})[\x80-\xBF]'
       . '|(?<=[\xE0-\xEF])[\x80-\xBF](?![\x80-\xBF])' # Short 3 byte sequence
       . '|(?<=[\xF0-\xF4])[\x80-\xBF](?![\x80-\xBF]{2})' # Short 4 byte sequence
       . '|(?<=[\xF0-\xF4][\x80-\xBF])[\x80-\xBF](?![\x80-\xBF])' # Short 4 byte sequence (2)
       . ')/';
     }
     // Exactly 0 means "no invalid sequence found". A false return is a PCRE
     // failure and must not be reported as valid.
     return ( preg_match( $pattern, $string ) === 0 );
   }
 }

 }}}

--
Ticket URL: <https://core.trac.wordpress.org/ticket/29717#comment:10>
WordPress Trac <https://core.trac.wordpress.org/>
WordPress publishing platform


More information about the wp-trac mailing list