<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head><meta http-equiv="content-type" content="text/html; charset=utf-8" />
<title>[5277] sites/trunk/api.wordpress.org/public_html/events/1.0: Events: Fallback to a `LIKE` query for ideographic languages</title>
</head>
<body>
<style type="text/css"><!--
#msg dl.meta { border: 1px #006 solid; background: #369; padding: 6px; color: #fff; }
#msg dl.meta dt { float: left; width: 6em; font-weight: bold; }
#msg dt:after { content:':';}
#msg dl, #msg dt, #msg ul, #msg li, #header, #footer, #logmsg { font-family: verdana,arial,helvetica,sans-serif; font-size: 10pt; }
#msg dl a { font-weight: bold}
#msg dl a:link { color:#fc3; }
#msg dl a:active { color:#ff0; }
#msg dl a:visited { color:#cc6; }
h3 { font-family: verdana,arial,helvetica,sans-serif; font-size: 10pt; font-weight: bold; }
#msg pre { overflow: auto; background: #ffc; border: 1px #fa0 solid; padding: 6px; }
#logmsg { background: #ffc; border: 1px #fa0 solid; padding: 1em 1em 0 1em; }
#logmsg p, #logmsg pre, #logmsg blockquote { margin: 0 0 1em 0; }
#logmsg p, #logmsg li, #logmsg dt, #logmsg dd { line-height: 14pt; }
#logmsg h1, #logmsg h2, #logmsg h3, #logmsg h4, #logmsg h5, #logmsg h6 { margin: .5em 0; }
#logmsg h1:first-child, #logmsg h2:first-child, #logmsg h3:first-child, #logmsg h4:first-child, #logmsg h5:first-child, #logmsg h6:first-child { margin-top: 0; }
#logmsg ul, #logmsg ol { padding: 0; list-style-position: inside; margin: 0 0 0 1em; }
#logmsg ul { text-indent: -1em; padding-left: 1em; }#logmsg ol { text-indent: -1.5em; padding-left: 1.5em; }
#logmsg > ul, #logmsg > ol { margin: 0 0 1em 0; }
#logmsg pre { background: #eee; padding: 1em; }
#logmsg blockquote { border: 1px solid #fa0; border-left-width: 10px; padding: 1em 1em 0 1em; background: white;}
#logmsg dl { margin: 0; }
#logmsg dt { font-weight: bold; }
#logmsg dd { margin: 0; padding: 0 0 0.5em 0; }
#logmsg dd:before { content:'\00bb';}
#logmsg table { border-spacing: 0px; border-collapse: collapse; border-top: 4px solid #fa0; border-bottom: 1px solid #fa0; background: #fff; }
#logmsg table th { text-align: left; font-weight: normal; padding: 0.2em 0.5em; border-top: 1px dotted #fa0; }
#logmsg table td { text-align: right; border-top: 1px dotted #fa0; padding: 0.2em 0.5em; }
#logmsg table thead th { text-align: center; border-bottom: 1px solid #fa0; }
#logmsg table th.Corner { text-align: left; }
#logmsg hr { border: none 0; border-top: 2px dashed #fa0; height: 1px; }
#header, #footer { color: #fff; background: #636; border: 1px #300 solid; padding: 6px; }
#patch { width: 100%; }
#patch h4 {font-family: verdana,arial,helvetica,sans-serif;font-size:10pt;padding:8px;background:#369;color:#fff;margin:0;}
#patch .propset h4, #patch .binary h4 {margin:0;}
#patch pre {padding:0;line-height:1.2em;margin:0;}
#patch .diff {width:100%;background:#eee;padding: 0 0 10px 0;overflow:auto;}
#patch .propset .diff, #patch .binary .diff {padding:10px 0;}
#patch span {display:block;padding:0 10px;}
#patch .modfile, #patch .addfile, #patch .delfile, #patch .propset, #patch .binary, #patch .copfile {border:1px solid #ccc;margin:10px 0;}
#patch ins {background:#dfd;text-decoration:none;display:block;padding:0 10px;}
#patch del {background:#fdd;text-decoration:none;display:block;padding:0 10px;}
#patch .lines, .info {color:#888;background:#fff;}
--></style>
<div id="msg">
<dl class="meta" style="font-size: 105%">
<dt style="float: left; width: 6em; font-weight: bold">Revision</dt> <dd><a style="font-weight: bold" href="http://meta.trac.wordpress.org/changeset/5277">5277</a><script type="application/ld+json">{"@context":"http://schema.org","@type":"EmailMessage","description":"Review this Commit","action":{"@type":"ViewAction","url":"http://meta.trac.wordpress.org/changeset/5277","name":"Review Commit"}}</script></dd>
<dt style="float: left; width: 6em; font-weight: bold">Author</dt> <dd>iandunn</dd>
<dt style="float: left; width: 6em; font-weight: bold">Date</dt> <dd>2017-04-07 05:20:54 +0000 (Fri, 07 Apr 2017)</dd>
</dl>
<pre style='padding-left: 1em; margin: 2em 0; border-left: 2px solid #ccc; line-height: 1.25; font-size: 105%; font-family: sans-serif'>Events: Fallback to a `LIKE` query for ideographic languages
MySQL &lt; 5.7.6 doesn't support full-text queries for ideographic languages, like Japanese.</pre>
<h3>Modified Paths</h3>
<ul>
<li><a href="#sitestrunkapiwordpressorgpublic_htmlevents10indexphp">sites/trunk/api.wordpress.org/public_html/events/1.0/index.php</a></li>
<li><a href="#sitestrunkapiwordpressorgpublic_htmlevents10teststestindexphp">sites/trunk/api.wordpress.org/public_html/events/1.0/tests/test-index.php</a></li>
</ul>
</div>
<div id="patch">
<h3>Diff</h3>
<a id="sitestrunkapiwordpressorgpublic_htmlevents10indexphp"></a>
<div class="modfile"><h4 style="background-color: #eee; color: inherit; margin: 1em 0; padding: 1.3em; font-size: 115%">Modified: sites/trunk/api.wordpress.org/public_html/events/1.0/index.php</h4>
<pre class="diff"><span>
<span class="info" style="display: block; padding: 0 10px; color: #888">--- sites/trunk/api.wordpress.org/public_html/events/1.0/index.php 2017-04-07 05:20:50 UTC (rev 5276)
+++ sites/trunk/api.wordpress.org/public_html/events/1.0/index.php 2017-04-07 05:20:54 UTC (rev 5277)
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -149,6 +149,10 @@
</span><span class="cx" style="display: block; padding: 0 10px">
</span><span class="cx" style="display: block; padding: 0 10px"> /*
</span><span class="cx" style="display: block; padding: 0 10px"> * Multi-word queries may contain cities, regions, and countries, so try to extract just the city
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+ *
+ * This won't work for most ideographic languages, because they don't use the space character as a word
+ * delimiter. That's ok, though, because `guess_ideographic_location_from_geonames()` should cover those
+ * cases.
</ins><span class="cx" style="display: block; padding: 0 10px"> */
</span><span class="cx" style="display: block; padding: 0 10px"> if ( ! $guess &amp;&amp; $location_word_count >= 2 ) {
</span><span class="cx" style="display: block; padding: 0 10px"> // Catch input like "Portland Maine"
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -206,10 +210,117 @@
</span><span class="cx" style="display: block; padding: 0 10px"> $timezone
</span><span class="cx" style="display: block; padding: 0 10px"> ) );
</span><span class="cx" style="display: block; padding: 0 10px">
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+ if ( ! is_a( $row, 'stdClass' ) &amp;&amp; 'ASCII' !== mb_detect_encoding( $location_name ) ) {
+ $row = guess_ideographic_location_from_geonames( $location_name, $country, $timezone );
+ }
+
</ins><span class="cx" style="display: block; padding: 0 10px"> return $row;
</span><span class="cx" style="display: block; padding: 0 10px"> }
</span><span class="cx" style="display: block; padding: 0 10px">
</span><span class="cx" style="display: block; padding: 0 10px"> /**
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+ * Look for the given ideographic location in the Geonames database
+ *
+ * This is a fallback for situations where the full-text search in `guess_location_from_geonames()` resulted
+ * in a false-negative. MySQL &lt; 5.7.6 doesn't support full-text searches on ideographic languages, because
+ * it cannot determine where the word boundaries are.
+ *
+ * See https://dev.mysql.com/doc/refman/5.7/en/fulltext-restrictions.html
+ *
+ * @param string $location_name
+ * @param string $country
+ * @param string $timezone
+ *
+ * @return stdClass|null
+ */
+function guess_ideographic_location_from_geonames( $location_name, $country, $timezone ) {
+ global $wpdb;
+
+ $ideographic_countries = get_ideographic_counties();
+ $ideographic_country_placeholders = get_prepare_placeholders( count( $ideographic_countries ), '%s' );
+
+ /*
+ * The name is wrapped in commas in order to ensure that we're only matching the exact location, which is
+ * delimited by commas. Otherwise, there would be false positives in situations where `$location_name`
+ * appears in other rows, which happens sometimes.
+ *
+ * Because this will only match entries that are prefixed _and_ postfixed with a comma, it will never match the
+ * first and last entries in the column. That's ok, though, because the first entry is always an airport code
+ * in English, which will be matched by other functions. The last entry is often ideographic, so it'd be nice
+ * to match it, but this is good enough for now.
+ */
+ $escaped_location_name = sprintf( '%%,%s,%%', $wpdb->esc_like( $location_name ) );
+
+ /*
+ * REPLACE() is used because sometimes the `alternatenames` column contains entries where the `asciiname` is
+ * prefixed to an ideographic name; for example: `,Karachi - كراچى,`
+ *
+ * If that prefix is not removed, then the LIKE query will fail in those cases, because
+ * `$escaped_location_name` is wrapped in commas.
+ *
+ * The query is restricted to countries where ideographic languages are common, in order to avoid a full-table
+ * scan.
+ */
+ $query = "
+ SELECT name, latitude, longitude, country
+ FROM `geoname`
+ WHERE
+ country IN ( $ideographic_country_placeholders ) AND
+ REPLACE( alternatenames, CONCAT( asciiname, ' - ' ), '' ) LIKE %s
+ ORDER BY
+ FIELD( %s, country ) DESC,
+ FIELD( %s, timezone ) DESC,
+ population DESC
+ LIMIT 1";
+
+ $prepared_query = $wpdb->prepare(
+ $query,
+ array_merge( $ideographic_countries, array( $escaped_location_name, $country, $timezone ) )
+ );
+
+ return $wpdb->get_row( $prepared_query );
+}
+
+/**
+ * Get an array of countries where ideographic languages are common
+ *
+ * Derived from https://en.wikipedia.org/wiki/List_of_writing_systems#List_of_writing_scripts_by_adoption
+ *
+ * @todo Some of these individual countries may be able to be removed, to further narrow the rows that need to be
+ * scanned by `guess_ideographic_location_from_geonames()`. Some of the entire categories could possibly be
+ * removed too, but let's err on the side of caution for now.
+ */
+function get_ideographic_counties() {
+ $middle_east = array( 'AE', 'BH', 'CY', 'EG', 'IL', 'IR', 'IQ', 'JO', 'KW', 'LB', 'OM', 'PS', 'QA', 'SA', 'SY', 'TR', 'YE' );
+ $north_africa = array( 'DZ', 'EH', 'EG', 'LY', 'MA', 'SD', 'SS', 'TN' );
+
+ $abjad_countries = array_merge( $middle_east, $north_africa, array( 'CN', 'IL', 'IN', 'MY', 'PK' ) );
+ $abugida_countries = array( 'BD', 'BT', 'ER', 'ET', 'ID', 'IN', 'KH', 'LA', 'LK', 'MV', 'MY', 'MU', 'MM', 'NP', 'PK', 'SG', 'TH' );
+ $logographic_countries = array( 'CN', 'JP', 'KR', 'MY', 'SG');
+
+ $all_ideographic_countries = array_merge( $abjad_countries, $abugida_countries, $logographic_countries );
+
+ return array_unique( $all_ideographic_countries );
+}
+
+/**
+ * Build a string of placeholders to pass to `WPDB::prepare()`
+ *
+ * Sometimes it's convenient to be able to generate placeholders for `prepare()` dynamically. For example, when
+ * looping through a multi-dimensional array where the sub-arrays have distinct counts; or when the total
+ * number of items is too large to conveniently count by hand.
+ *
+ * See https://iandunn.name/2016/03/31/generating-dynamic-placeholders-for-wpdb-prepare/
+ *
+ * @param int $number The number of placeholders needed
+ * @param string $format An sprintf()-like format accepted by WPDB::prepare()
+ *
+ * @return string
+ */
+function get_prepare_placeholders( $number, $format ) {
+ return implode( ', ', array_fill( 0, $number, $format ) );
+}
+
+/**
</ins><span class="cx" style="display: block; padding: 0 10px"> * Determine a location for the given IPv4 address
</span><span class="cx" style="display: block; padding: 0 10px"> *
</span><span class="cx" style="display: block; padding: 0 10px"> * @todo - Add support for IPv6 addresses. Otherwise, this will quickly lose effectiveness. As of March 2017, IPv6
</span></span></pre></div>
<a id="sitestrunkapiwordpressorgpublic_htmlevents10teststestindexphp"></a>
<div class="modfile"><h4 style="background-color: #eee; color: inherit; margin: 1em 0; padding: 1.3em; font-size: 115%">Modified: sites/trunk/api.wordpress.org/public_html/events/1.0/tests/test-index.php</h4>
<pre class="diff"><span>
<span class="info" style="display: block; padding: 0 10px; color: #888">--- sites/trunk/api.wordpress.org/public_html/events/1.0/tests/test-index.php 2017-04-07 05:20:50 UTC (rev 5276)
+++ sites/trunk/api.wordpress.org/public_html/events/1.0/tests/test-index.php 2017-04-07 05:20:54 UTC (rev 5277)
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -304,7 +304,7 @@
</span><span class="cx" style="display: block; padding: 0 10px"> ),
</span><span class="cx" style="display: block; padding: 0 10px"> ),
</span><span class="cx" style="display: block; padding: 0 10px">
</span><del style="background-color: #fdd; text-decoration:none; display:block; padding: 0 10px">- 'city-endonym-non-latin-length-greater-than-4-asia' => array(
</del><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+ 'city-endonym-ideographic-asia1' => array(
</ins><span class="cx" style="display: block; padding: 0 10px"> 'input' => array(
</span><span class="cx" style="display: block; padding: 0 10px"> 'location_name' => '白浜町宇佐崎南',
</span><span class="cx" style="display: block; padding: 0 10px"> 'locale' => 'ja',
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -318,12 +318,7 @@
</span><span class="cx" style="display: block; padding: 0 10px"> ),
</span><span class="cx" style="display: block; padding: 0 10px"> ),
</span><span class="cx" style="display: block; padding: 0 10px">
</span><del style="background-color: #fdd; text-decoration:none; display:block; padding: 0 10px">- /*
- * @todo
- *
- * This is currently failing, but should pass. It looks like the value is in row 112931.
- */
- 'city-endonym-non-latin-length-greater-than-4-asia2' => array(
</del><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+ 'city-endonym-ideographic-asia2' => array(
</ins><span class="cx" style="display: block; padding: 0 10px"> 'input' => array(
</span><span class="cx" style="display: block; padding: 0 10px"> 'location_name' => 'تهران',
</span><span class="cx" style="display: block; padding: 0 10px"> 'locale' => 'fa_IR',
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -337,12 +332,7 @@
</span><span class="cx" style="display: block; padding: 0 10px"> ),
</span><span class="cx" style="display: block; padding: 0 10px"> ),
</span><span class="cx" style="display: block; padding: 0 10px">
</span><del style="background-color: #fdd; text-decoration:none; display:block; padding: 0 10px">- /*
- * @todo
- *
- * This is currently failing, but should pass. It looks like the value is in row 1174872
- */
- 'city-endonym-non-latin-length-greater-than-4-asia3' => array(
</del><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+ 'city-endonym-ideographic-asia3' => array(
</ins><span class="cx" style="display: block; padding: 0 10px"> 'input' => array(
</span><span class="cx" style="display: block; padding: 0 10px"> 'location_name' => 'كراچى',
</span><span class="cx" style="display: block; padding: 0 10px"> 'locale' => 'ur',
</span><span class="lines" style="display: block; padding: 0 10px; color: #888">@@ -356,17 +346,22 @@
</span><span class="cx" style="display: block; padding: 0 10px"> ),
</span><span class="cx" style="display: block; padding: 0 10px"> ),
</span><span class="cx" style="display: block; padding: 0 10px">
</span><del style="background-color: #fdd; text-decoration:none; display:block; padding: 0 10px">- /*
- * @todo
- *
- * This is currently failing, but it should pass. One reason it may be failing is that `ft_min_word_len`
- * is set to `4` and `東京` is only `2`.
- *
- * But, there are others that are failing that are >= 4, though, like `シラオカ`, `しらおか`, `Ширахама`, and
- *`すぎと,スギト`. So, there may be additional reasons too.
- */
- 'city-endonym-non-latin-length-less-than-4-asia' => array(
</del><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+ 'city-endonym-ideographic-asia4' => array(
</ins><span class="cx" style="display: block; padding: 0 10px"> 'input' => array(
</span><ins style="background-color: #dfd; text-decoration:none; display:block; padding: 0 10px">+ 'location_name' => '京都',
+ 'locale' => 'ja',
+ 'timezone' => 'Asia/Tokyo',
+ ),
+ 'expected' => array(
+ 'description' => 'kyoto',
+ 'latitude' => '35.021',
+ 'longitude' => '135.754',
+ 'country' => 'JP',
+ ),
+ ),
+
+ 'city-endonym-ideographic-asia5' => array(
+ 'input' => array(
</ins><span class="cx" style="display: block; padding: 0 10px"> 'location_name' => '東京',
</span><span class="cx" style="display: block; padding: 0 10px"> 'locale' => 'ja',
</span><span class="cx" style="display: block; padding: 0 10px"> 'timezone' => 'Asia/Tokyo',
</span></span></pre>
</div>
</div>
</body>
</html>