Please help me understand how multibyte characters such as emoji are handled in MySQL utf8mb4 columns.
See below for a simple SQL test script that illustrates the problem.
/* Reset: remove both test tables left over from an earlier run, if present */
DROP TABLE IF EXISTS `emoji_test`, `emoji_test_with_unique_key`;
/* Schema: baseline table, no secondary index on `string` */
CREATE TABLE `emoji_test` (
    `id`     INT(11)    NOT NULL AUTO_INCREMENT,
    # Column under test: utf8mb4 text compared with the utf8mb4_unicode_ci collation.
    `string` VARCHAR(191)
             CHARACTER SET utf8mb4
             COLLATE utf8mb4_unicode_ci
             NOT NULL DEFAULT '',
    `status` TINYINT(1) NOT NULL DEFAULT '1',
    PRIMARY KEY (`id`)
)
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4;
# Same columns as `emoji_test`, plus a unique BTREE index over (`string`, `status`).
# NOTE(review): MySQL documents that utf8mb4_unicode_ci (UCA 4.0.0) gives every
# supplementary-plane character the same collation weight, which would explain the
# duplicate-key errors recorded with the INSERT statements. Declaring `string` with
# utf8mb4_bin or utf8mb4_unicode_520_ci is the likely remedy -- confirm on the
# target server version before changing the schema.
CREATE TABLE `emoji_test_with_unique_key` (
    `id`     INT(11)    NOT NULL AUTO_INCREMENT,
    `string` VARCHAR(191)
             CHARACTER SET utf8mb4
             COLLATE utf8mb4_unicode_ci
             NOT NULL DEFAULT '',
    `status` TINYINT(1) NOT NULL DEFAULT '1',
    PRIMARY KEY (`id`),
    UNIQUE KEY `idx_string_status` (`string`, `status`) USING BTREE
)
ENGINE = InnoDB
DEFAULT CHARACTER SET = utf8mb4;
/* INSERT data */
# Every literal below must consist of genuine 4-byte (supplementary-plane) emoji:
# U+1F336 (hot pepper) and U+1F32E (taco). 3-byte BMP characters would not
# exercise the behaviour this script demonstrates.
# NOTE(review): the specific emoji are a best-effort choice; any two distinct
# supplementary-plane characters serve the same purpose.
# Expected result is a successful insert for each of these.
# However, some fail. The observed outcome is recorded after each statement.

# Baseline table (no unique key): all four inserts succeed.
INSERT INTO emoji_test (`string`, `status`) VALUES ('🌶', 1); # SUCCESS
INSERT INTO emoji_test (`string`, `status`) VALUES ('🌮', 1); # SUCCESS
INSERT INTO emoji_test (`string`, `status`) VALUES ('🌮🌶', 1); # SUCCESS
INSERT INTO emoji_test (`string`, `status`) VALUES ('🌶🌮', 1); # SUCCESS

# Table with unique key on (`string`, `status`): the second value of each length is rejected.
INSERT INTO emoji_test_with_unique_key (`string`, `status`) VALUES ('🌶', 1); # SUCCESS
INSERT INTO emoji_test_with_unique_key (`string`, `status`) VALUES ('🌮', 1); # FAIL: Duplicate entry '?-1' for key 'idx_string_status'
INSERT INTO emoji_test_with_unique_key (`string`, `status`) VALUES ('🌮🌶', 1); # SUCCESS
INSERT INTO emoji_test_with_unique_key (`string`, `status`) VALUES ('🌶🌮', 1); # FAIL: Duplicate entry '??-1' for key 'idx_string_status'
/* Test data */
# Read-back checks. The observed outcome is recorded after each query.
# The literals are 4-byte emoji (U+1F336 hot pepper, U+1F32E taco) and must match
# the values used by the INSERT statements.

/* Simple Table */
SELECT * FROM emoji_test WHERE `string` IN ('🌶','🌮','🌮🌶','🌶🌮'); # SUCCESS (all 4 are found)
SELECT * FROM emoji_test WHERE `string` IN ('🌶'); # FAIL: Returns both 🌶 and 🌮
SELECT * FROM emoji_test WHERE `string` IN ('🌮'); # FAIL: Returns both 🌶 and 🌮
SELECT * FROM emoji_test; # SUCCESS (all 4 are found)

/* Table with Unique Key */
SELECT * FROM emoji_test_with_unique_key WHERE `string` IN ('🌶','🌮','🌮🌶','🌶🌮'); # FAIL: Only 2 are found (due to insert errors above)
SELECT * FROM emoji_test_with_unique_key WHERE `string` IN ('🌶'); # SUCCESS
SELECT * FROM emoji_test_with_unique_key WHERE `string` IN ('🌮'); # FAIL: 🌶 found instead of 🌮
SELECT * FROM emoji_test_with_unique_key; # FAIL: Only 2 records found (🌶 and 🌮🌶)
I'm interested in learning what causes the FAILs above and how I can work around them.
Specifically:
- Why do selects for one multibyte character return results for any multibyte character?
- How can I configure an index to handle multibyte characters instead of `?`?
- Can you recommend changes to the second `CREATE TABLE` (the one with a unique key) above that make all the test queries return successfully?