From 52c063fd45f327169a08d2eadbb2904678f2bb40 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen
Date: Mon, 23 May 2022 10:12:38 +0200
Subject: [PATCH] add support for UTF-7 mailbox names

this finally makes us compliant with IMAP4rev1. how fitting that the
meanwhile released IMAP4rev2 demoted UTF-7 to legacy status ...

based on a patch by Georgy Kibardin.
---
 TODO                |   2 -
 src/.gitignore      |   1 +
 src/Makefile.am     |   6 +-
 src/drv_imap.c      |  21 ++++
 src/imap_p.h        |   4 +
 src/imap_utf7.c     | 288 ++++++++++++++++++++++++++++++++++++++++++++
 src/tst_imap_utf7.c | 116 ++++++++++++++++++
 7 files changed, 434 insertions(+), 4 deletions(-)
 create mode 100644 src/imap_utf7.c
 create mode 100644 src/tst_imap_utf7.c

diff --git a/TODO b/TODO
index dacdeb4..a04b253 100644
--- a/TODO
+++ b/TODO
@@ -6,8 +6,6 @@ automatically resume upon transient errors, e.g. "connection reset by peer"
 or timeout after some data was already transmitted.
 possibly also try to handle Exchange's "glitches" somehow.
 
-add support for IMAP UTF-7 (for internationalized mailbox names).
-
 uidvalidity lock timeout handling would be a good idea.
 
 should complain when multiple Channels match the same folders.
diff --git a/src/.gitignore b/src/.gitignore
index 5e7fc35..3139876 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -3,6 +3,7 @@
 /mbsync
 /mdconvert
 /tst_imap_msgs
+/tst_imap_utf7
 /tst_msg_cvt
 /tst_timers
 /tmp
diff --git a/src/Makefile.am b/src/Makefile.am
index ab56418..69cf29d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,7 +5,7 @@
 mbsync_SOURCES = \
 	util.c config.c socket.c \
 	driver.c drv_proxy.c \
-	drv_imap.c imap_msgs.c \
+	drv_imap.c imap_msgs.c imap_utf7.c \
 	drv_maildir.c \
 	sync.c sync_state.c sync_msg_cvt.c \
 	main.c main_sync.c main_list.c
@@ -54,10 +54,12 @@ man_MANS = mbsync.1 $(mdconvert_man)
 
 tst_imap_msgs_SOURCES = tst_imap_msgs.c imap_msgs.c util.c
 
+tst_imap_utf7_SOURCES = tst_imap_utf7.c imap_utf7.c util.c
+
 tst_msg_cvt_SOURCES = tst_msg_cvt.c sync_msg_cvt.c util.c
 tst_msg_cvt_CFLAGS = -DQPRINTF_BUFF=10000
 
-check_PROGRAMS = tst_imap_msgs tst_msg_cvt
+check_PROGRAMS = tst_imap_msgs tst_imap_utf7 tst_msg_cvt
 TESTS = $(check_PROGRAMS)
 
 tst_timers_SOURCES = tst_timers.c util.c
diff --git a/src/drv_imap.c b/src/drv_imap.c
index 911df57..ad95e3d 100644
--- a/src/drv_imap.c
+++ b/src/drv_imap.c
@@ -1577,6 +1577,7 @@ list3_rsp_atom( imap_store_t *ctx, char *arg, uint len, int type ATTR_UNUSED )
 	string_list_t *narg;
 	int argl = (int)len;
 	uint l;
+	char rarg[1130];  // See imap_utf7_to_utf8() for the origin of that number
 
 	if (!arg)
 		return LIST_BAD;
@@ -1608,6 +1609,16 @@ list3_rsp_atom( imap_store_t *ctx, char *arg, uint len, int type ATTR_UNUSED )
 	}
 	if (argl >= 5 && !memcmp( arg + argl - 5, ".lock", 5 )) /* workaround broken servers */
 		return LIST_OK;
+	if (!(CAP(UTF8_ACCEPT) || CAP(UTF8_ONLY))) {
+		int rargl = imap_utf7_to_utf8( arg, argl, rarg );
+		if (rargl < 0) {
+			error( "IMAP error: invalid modified-UTF-7 string '%.*s'.\n", argl, arg );
+			return LIST_BAD;
+		}
+		assert( (uint)rargl < sizeof(rarg) );
+		arg = rarg;
+		argl = rargl;
+	}
 	if (map_name( arg, argl, (char **)&narg, offsetof(string_list_t, string), ctx->delimiter, "/") < 0) {
 		warn( "IMAP warning: ignoring mailbox %.*s (reserved character '/' in name)\n", argl, arg );
 		return LIST_OK;
@@ -1665,6 +1676,16 @@ prepare_name( char **buf, const imap_store_t *ctx, const char *prefix, const cha
 		return -1;
 	default:
 		memcpy( *buf, prefix, pl );
+		if (!(CAP(UTF8_ACCEPT) || CAP(UTF8_ONLY))) {
+			char *nbuf = imap_utf8_to_utf7( *buf );
+			if (!nbuf) {
+				error( "IMAP error: invalid UTF-8 string '%s'\n", *buf );
+				free( *buf );
+				return -1;
+			}
+			free( *buf );
+			*buf = nbuf;
+		}
 		return 0;
 	}
 }
diff --git a/src/imap_p.h b/src/imap_p.h
index 76e02e2..1c7933e 100644
--- a/src/imap_p.h
+++ b/src/imap_p.h
@@ -10,6 +10,7 @@
 #include "driver.h"
 
 //#define DEBUG_IMAP_MSGS
+//#define DEBUG_IMAP_UTF7
 
 typedef union imap_message {
 	message_t gen;
@@ -45,4 +46,7 @@ void reset_imap_messages( imap_messages_t *msgs );
 void imap_ensure_relative( imap_messages_t *msgs );
 void imap_ensure_absolute( imap_messages_t *msgs );
 
+char *imap_utf8_to_utf7( const char *buf );
+int imap_utf7_to_utf8( const char *buf, int argl, char *outbuf );
+
 #endif
diff --git a/src/imap_utf7.c b/src/imap_utf7.c
new file mode 100644
index 0000000..ac91cdb
--- /dev/null
+++ b/src/imap_utf7.c
@@ -0,0 +1,288 @@
+// SPDX-FileCopyrightText: 2018-2021 Georgy Kibardin
+// SPDX-FileCopyrightText: 2022 Oswald Buddenhagen
+// SPDX-License-Identifier: GPL-2.0-or-later WITH LicenseRef-isync-GPL-exception
+//
+// mbsync - mailbox synchronizer
+//
+
+#include "imap_p.h"
+
+#ifdef DEBUG_IMAP_UTF7
+# define dbg(...) print(__VA_ARGS__)
+#else
+# define dbg(...) do { } while (0)
+#endif
+
+struct bit_fifo {
+	unsigned long long value;
+	uint bits;
+};
+
+static void
+add_bits( struct bit_fifo *fifo, uint bits, uint size )
+{
+	fifo->value = (fifo->value << size) | bits;
+	fifo->bits += size;
+	assert( fifo->bits <= sizeof(fifo->value) * 8 );
+}
+
+static uint
+eat_bits( struct bit_fifo *fifo, uint size )
+{
+	fifo->bits -= size;
+	return (fifo->value >> fifo->bits) & ((1LL << size) - 1);
+}
+
+static uint
+peek_bits( struct bit_fifo *fifo, uint size )
+{
+	return (fifo->value >> (fifo->bits - size)) & ((1LL << size) - 1);
+}
+
+static void
+add_char( char **p, uint chr )
+{
+	*((*p)++) = (char)chr;
+}
+
+static uchar
+eat_char( const char **p )
+{
+	return (uchar)*((*p)++);
+}
+
+static uint
+read_as_utf8( const char **utf8_buf_p )
+{
+	uchar chr = eat_char( utf8_buf_p );
+	if (chr < 0x80)
+		return chr;
+	if ((chr & 0xf8) == 0xf0) {
+		uchar chr2 = eat_char( utf8_buf_p );
+		if ((chr2 & 0xc0) != 0x80)
+			return ~0;
+		uchar chr3 = eat_char( utf8_buf_p );
+		if ((chr3 & 0xc0) != 0x80)
+			return ~0;
+		uchar chr4 = eat_char( utf8_buf_p );
+		if ((chr4 & 0xc0) != 0x80)
+			return ~0;
+		return ((chr & 0x7) << 18) |
+		       ((chr2 & 0x3f) << 12) |
+		       ((chr3 & 0x3f) << 6) |
+		       (chr4 & 0x3f);
+	}
+	if ((chr & 0xf0) == 0xe0) {
+		uchar chr2 = eat_char( utf8_buf_p );
+		if ((chr2 & 0xc0) != 0x80)
+			return ~0;
+		uchar chr3 = eat_char( utf8_buf_p );
+		if ((chr3 & 0xc0) != 0x80)
+			return ~0;
+		return ((chr & 0xf) << 12) |
+		       ((chr2 & 0x3f) << 6) |
+		       (chr3 & 0x3f);
+	}
+	if ((chr & 0xe0) == 0xc0) {
+		uchar chr2 = eat_char( utf8_buf_p );
+		if ((chr2 & 0xc0) != 0x80)
+			return ~0;
+		return (chr & 0x1f) << 6 |
+		       (chr2 & 0x3f);
+	}
+	return ~0;
+}
+
+static int
+needs_encoding( uint chr )
+{
+	return chr && (chr <= 0x1f || chr >= 0x7f);
+}
+
+static uint
+utf16_encode( uint chr )
+{
+	chr -= 0x10000;
+	return (((chr >> 10) + 0xd800) << 16) | ((chr & 0x3ff) + 0xdc00);
+}
+
+static uchar
+b64_encode( uint chr )
+{
+	assert( chr <= 0x3f );
+	return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"[chr];
+}
+
+char *
+imap_utf8_to_utf7( const char *buf )
+{
+	// Size requirements:
+	// - pass-through: l, 1 => 1
+	// - all "&": l * 2, 1 => 2
+	// - 7-bit: (l * 2 * 4 + 2) / 3 + 2, ~ l * 2.7, 1 => 5
+	// - 3-octet: (l / 3 * 2 * 4 + 2) / 3 + 2, ~ l * 0.9, 3 => 5
+	// - 4-octet: (l / 4 * 2 * 2 * 4 + 2) / 3 + 2, ~ l * 1.3, 4 => 8
+	// => worst case: "&" and 7-bit alternating: l * 3.5, 2 => 7
+	int outsz = strlen( buf ) * 7 / 2 + 3;
+	char *result = nfmalloc( outsz );
+	char *outp = result;
+	struct bit_fifo fifo = { 0, 0 };
+	int encoding = 0;
+	uint chr;
+	do {
+		chr = read_as_utf8( &buf );
+		if (chr == ~0U) {
+			dbg( "Error: invalid UTF-8 string\n" );
+			free( result );
+			return NULL;
+		}
+		if (needs_encoding( chr )) {
+			if (!encoding) {
+				add_char( &outp, '&' );
+				encoding = 1;
+			}
+			if (chr <= 0xffff)
+				add_bits( &fifo, chr, 16 );
+			else
+				add_bits( &fifo, utf16_encode( chr ), 32 );
+			while (fifo.bits >= 6)
+				add_char( &outp, b64_encode( eat_bits( &fifo, 6 ) ) );
+		} else {
+			if (encoding) {
+				if (fifo.bits) {
+					uint trailing_bits = 6 - fifo.bits;
+					uchar trail = b64_encode( eat_bits( &fifo, fifo.bits ) << trailing_bits );
+					add_char( &outp, trail );
+				}
+				add_char( &outp, '-' );
+				encoding = 0;
+			}
+			add_char( &outp, chr );
+			if (chr == '&')
+				add_char( &outp, '-' );
+		}
+	} while (chr);
+	assert( (int)(outp - result) <= outsz );
+	return result;
+}
+
+static void
+write_as_utf8( char **outp, uint chr )
+{
+	if (chr <= 0x7f) {
+		add_char( outp, chr );
+	} else if (chr <= 0x7ff) {
+		add_char( outp, (chr >> 6) | 0xc0 );
+		add_char( outp, (chr & 0x3f) | 0x80 );
+	} else if (chr <= 0xffff) {
+		add_char( outp, (chr >> 12) | 0xe0 );
+		add_char( outp, ((chr >> 6) & 0x3f) | 0x80 );
+		add_char( outp, (chr & 0x3f) | 0x80 );
+	} else {
+		assert( chr <= 0x10ffff );  // utf16_decode() tops out at the 0xdbff/0xdfff pair
+		add_char( outp, (chr >> 18) | 0xf0 );
+		add_char( outp, ((chr >> 12) & 0x3f) | 0x80 );
+		add_char( outp, ((chr >> 6) & 0x3f) | 0x80 );
+		add_char( outp, (chr & 0x3f) | 0x80 );
+	}
+}
+
+static int
+need_another_16bit( uint bits )
+{
+	return (bits & 0xfc00) == 0xd800;
+}
+
+static uint
+utf16_decode( uint subject )
+{
+	return 0x10000 + (((subject >> 16) - 0xd800) << 10) + ((subject & 0xffff) - 0xdc00);
+}
+
+static uint
+b64_decode( uchar chr )
+{
+	static uint lu[128] = {
+		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,
+		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,
+		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 62, 63, ~0, ~0, ~0,
+		52, 53, 54, 55, 56, 57, 58, 59, 60, 61, ~0, ~0, ~0, ~0, ~0, ~0,
+		~0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
+		15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, ~0, ~0, ~0, ~0, ~0,
+		~0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+		41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ~0, ~0, ~0, ~0, ~0,
+	};
+	return lu[chr];
+}
+
+int
+imap_utf7_to_utf8( const char *buf, int bufl, char *outbuf )
+{
+	// Size requirements:
+	// - pass-through: l (shortest worst case)
+	// - all "&": l / 2, 2 => 1, * .5
+	// - 7-bit: ((l - 2) * 3 + 1) / 4 / 2, ~ l * .38, 5 => 1, * .2
+	// - 3-octet: ((l - 2) * 3 + 1) / 4 / 2 * 3, ~ l * 1.13, 5 => 3, * .6 (generic worst case)
+	// - 4-octet: ((l - 2) * 3 + 1) / 4 / 2 / 2 * 4, ~ l * .75, 8 => 4, * .5
+	// => reserve bufl * 9 / 8
+	char *outp = outbuf;
+	struct bit_fifo fifo = { 0, 0 };
+	const char *bufe = buf + bufl;
+	while (buf != bufe) {
+		uchar chr = *buf++;
+		if (chr != '&') {
+			if (chr & 0x80) {
+				dbg( "Error: 8-bit char %x\n", chr );
+				return -1;
+			}
+			add_char( &outp, chr );
+			continue;
+		}
+		if (buf == bufe) {
+			dbg( "Error: unterminated shift sequence\n" );
+			return -1;
+		}
+		chr = *buf++;
+		if (chr == '-') {
+			add_char( &outp, '&' );
+			continue;
+		}
+		fifo.bits = 0;
+		do {
+			if (chr & 0x80) {
+				dbg( "Error: 8-bit char %x\n", chr );
+				return -1;
+			}
+			uint bits = b64_decode( chr );
+			if (bits == ~0U) {
+				dbg( "Error: char %x outside alphabet\n", chr );
+				return -1;
+			}
+			add_bits( &fifo, bits, 6 );
+			if (fifo.bits >= 16) {
+				if (need_another_16bit( peek_bits( &fifo, 16 ) )) {
+					if (fifo.bits >= 32) {
+						uint utf16 = eat_bits( &fifo, 32 );
+						if ((utf16 & 0xfc00) != 0xdc00) {
+							dbg( "Error: unpaired UTF-16 surrogate\n" );
+							return -1;
+						}
+						write_as_utf8( &outp, utf16_decode( utf16 ) );
+					}
+				} else {
+					write_as_utf8( &outp, eat_bits( &fifo, 16 ) );
+				}
+			}
+			if (buf == bufe) {
+				dbg( "Error: unterminated shift sequence\n" );
+				return -1;
+			}
+			chr = *buf++;
+		} while (chr != '-');
+		if (fifo.bits > 6) {
+			dbg( "Error: incomplete code point\n" );
+			return -1;
+		}
+	}
+	return (int)(outp - outbuf);
+}
diff --git a/src/tst_imap_utf7.c b/src/tst_imap_utf7.c
new file mode 100644
index 0000000..dcc3013
--- /dev/null
+++ b/src/tst_imap_utf7.c
@@ -0,0 +1,116 @@
+// SPDX-FileCopyrightText: 2022 Oswald Buddenhagen
+// SPDX-License-Identifier: GPL-2.0-or-later
+//
+// isync test suite
+//
+
+#include "imap_p.h"
+
+static struct {
+	const char *utf8, *utf7;
+} data[] = {
+	{ u8"", "" },
+	{ u8"1", "1" },
+	{ u8"word", "word" },
+	{ u8"&", "&-" },
+	{ NULL, "&" },
+	{ NULL, "&-&" },
+	{ u8"&&", "&-&-" },
+	{ u8"1&1", "1&-1" },
+	{ u8"&1&", "&-1&-" },
+	{ u8"\t", "&AAk-" },
+	{ NULL, "&AAk" },
+	{ NULL, "&AA-" },
+	{ NULL, "&*Ak-" },
+	{ NULL, "&&-" },
+	{ u8"m\x7f""ll", "m&AH8-ll" },
+	{ u8"\t&", "&AAk-&-" },
+	{ u8"\t&\t", "&AAk-&-&AAk-" },
+	{ u8"&\t", "&-&AAk-" },
+	{ u8"&\t&", "&-&AAk-&-" },
+	{ u8"ä", "&AOQ-" },
+	{ u8"\x83\x84", NULL },
+	{ u8"\xc3\xc4", NULL },
+	{ u8"\xc3", NULL },
+	{ u8"äö", "&AOQA9g-" },
+	{ u8"äöü", "&AOQA9gD8-" },
+	{ u8"Ḁ", "&HgA-" },
+	{ u8"\xe1\xc8\x80", NULL },
+	{ u8"\xe1\xb8\xf0", NULL },
+	{ u8"\xe1\xb8", NULL },
+	{ u8"\xe1", NULL },
+	{ u8"Ḁḁ", "&HgAeAQ-" },
+	{ u8"😂", "&2D3eAg-" },
+	{ u8"\xf8\x9f\x98\x82", NULL },
+	{ u8"\xf0\xcf\x98\x82", NULL },
+	{ u8"\xf0\x9f\xd8\x82", NULL },
+	{ u8"\xf0\x9f\x98\xe2", NULL },
+	{ u8"\xf0\x9f\x98", NULL },
+	{ u8"\xf0\x9f", NULL },
+	{ u8"\xf0", NULL },
+	{ NULL, "&2D0-" },
+	{ u8"😈😎", "&2D3eCNg93g4-" },
+	{ u8"müll", "m&APw-ll" },
+	{ u8"mü", "m&APw-" },
+	{ u8"über", "&APw-ber" },
+};
+
+int
+main( void )
+{
+	int ret = 0;
+
+	for (uint i = 0; i < as(data); i++) {
+		if (!data[i].utf8)
+			continue;
+		xprintf( "To UTF-7 \"%s\" (\"%!s\") ...\n", data[i].utf8, data[i].utf8 );
+		char *utf7 = imap_utf8_to_utf7( data[i].utf8 );
+		if (utf7) {
+			if (!data[i].utf7) {
+				xprintf( "Unexpected success: \"%s\" (\"%!s\")\n", utf7, utf7 );
+				ret = 1;
+			} else if (strcmp( utf7, data[i].utf7 )) {
+				xprintf( "Mismatch, got \"%s\" (\"%!s\"), want \"%!s\"\n",
+				         utf7, utf7, data[i].utf7 );
+				ret = 1;
+			}
+			free( utf7 );
+		} else {
+			if (data[i].utf7) {
+				xprintf( "Conversion failure.\n" );
+				ret = 1;
+			}
+		}
+	}
+
+	for (uint i = 0; i < as(data); i++) {
+		if (!data[i].utf7)
+			continue;
+		xprintf( "From UTF-7 \"%!s\" ...\n", data[i].utf7 );
+		int utf7len = strlen( data[i].utf7 );
+		char utf8buf[1000];
+		int utf8len = imap_utf7_to_utf8( data[i].utf7, utf7len, utf8buf );
+		if (utf8len >= 0) {
+			if (!data[i].utf8) {
+				xprintf( "Unexpected success: \"%.*s\" (\"%.*!s\")\n",
+				         utf8len, utf8buf, utf8len, utf8buf );
+				ret = 1;
+			} else {
+				int wantlen = strlen( data[i].utf8 );
+				if (utf8len != wantlen || memcmp( utf8buf, data[i].utf8, utf8len )) {
+					xprintf( "Mismatch, got \"%.*s\" (\"%.*!s\"), want \"%s\" (\"%!s\")\n",
+					         utf8len, utf8buf, utf8len, utf8buf, data[i].utf8, data[i].utf8 );
+					ret = 1;
+				}
+			}
+			assert( utf8len < utf7len * 9 / 8 + 1 );
+		} else {
+			if (data[i].utf8) {
+				xprintf( "Conversion failure.\n" );
+				ret = 1;
+			}
+		}
+	}
+
+	return ret;
+}