add support for UTF-7 mailbox names

this finally makes us compliant with IMAP4rev1. how fitting that IMAP4rev2, which was released in the meantime, demoted UTF-7 to legacy status ... based on a patch by Georgy Kibardin <georgy@kibardin.name>.
2022-05-23 10:12:38 +02:00 · 2022-05-23 10:12:38 +02:00 · 52c063fd45
commit 52c063fd45
parent efab63fb8e
7 changed files with 434 additions and 4 deletions
--- a/2
+++ b/2
@ -6,8 +6,6 @@ automatically resume upon transient errors, e.g. "connection reset by peer"
 or timeout after some data was already transmitted.
 possibly also try to handle Exchange's "glitches" somehow.
 add support for IMAP UTF-7 (for internationalized mailbox names).
 uidvalidity lock timeout handling would be a good idea.
 should complain when multiple Channels match the same folders.
--- a/src/.gitignore
+++ b/src/.gitignore
@ -3,6 +3,7 @@
 /mbsync
 /mdconvert
 /tst_imap_msgs
 /tst_imap_utf7
 /tst_msg_cvt
 /tst_timers
 /tmp
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -5,7 +5,7 @@
 mbsync_SOURCES = \
 	util.c config.c socket.c \
 	driver.c drv_proxy.c \
-	drv_imap.c imap_msgs.c \
+	drv_imap.c imap_msgs.c imap_utf7.c \
 	drv_maildir.c \
 	sync.c sync_state.c sync_msg_cvt.c \
 	main.c main_sync.c main_list.c
@ -54,10 +54,12 @@ man_MANS = mbsync.1 $(mdconvert_man)
 tst_imap_msgs_SOURCES = tst_imap_msgs.c imap_msgs.c util.c
 tst_imap_utf7_SOURCES = tst_imap_utf7.c imap_utf7.c util.c
 tst_msg_cvt_SOURCES = tst_msg_cvt.c sync_msg_cvt.c util.c
 tst_msg_cvt_CFLAGS = -DQPRINTF_BUFF=10000
-check_PROGRAMS = tst_imap_msgs tst_msg_cvt
+check_PROGRAMS = tst_imap_msgs tst_imap_utf7 tst_msg_cvt
 TESTS = $(check_PROGRAMS)
 tst_timers_SOURCES = tst_timers.c util.c
--- a/src/drv_imap.c
+++ b/src/drv_imap.c
@ -1577,6 +1577,7 @@ list3_rsp_atom( imap_store_t *ctx, char *arg, uint len, int type ATTR_UNUSED )
 	string_list_t *narg;
 	int argl = (int)len;
 	uint l;
 	char rarg[1130];  // See imap_utf7_to_utf8() for the origin of that number
 	if (!arg)
 		return LIST_BAD;
@ -1608,6 +1609,16 @@ list3_rsp_atom( imap_store_t *ctx, char *arg, uint len, int type ATTR_UNUSED )
 	}
 	if (argl >= 5 && !memcmp( arg + argl - 5, ".lock", 5 )) /* workaround broken servers */
 		return LIST_OK;
 	if (!(CAP(UTF8_ACCEPT) || CAP(UTF8_ONLY))) {
 		int rargl = imap_utf7_to_utf8( arg, argl, rarg );
 		if (rargl < 0) {
 			error( "IMAP error: invalid modified-UTF-7 string '%.*s'.\n", argl, arg );
 			return LIST_BAD;
 		}
 		assert( (uint)rargl < sizeof(rarg) );
 		arg = rarg;
 		argl = rargl;
 	}
 	if (map_name( arg, argl, (char **)&narg, offsetof(string_list_t, string), ctx->delimiter, "/") < 0) {
 		warn( "IMAP warning: ignoring mailbox %.*s (reserved character '/' in name)\n", argl, arg );
 		return LIST_OK;
@ -1665,6 +1676,16 @@ prepare_name( char **buf, const imap_store_t *ctx, const char *prefix, const cha
 		return -1;
 	default:
 		memcpy( *buf, prefix, pl );
 		if (!(CAP(UTF8_ACCEPT) || CAP(UTF8_ONLY))) {
 			char *nbuf = imap_utf8_to_utf7( *buf );
 			if (!nbuf) {
 				error( "IMAP error: invalid UTF-8 string '%s'\n", *buf );
 				free( *buf );
 				return -1;
 			}
 			free( *buf );
 			*buf = nbuf;
 		}
 		return 0;
 	}
 }
--- a/src/imap_p.h
+++ b/src/imap_p.h
@ -10,6 +10,7 @@
 #include "driver.h"
 //#define DEBUG_IMAP_MSGS
 //#define DEBUG_IMAP_UTF7
 typedef union imap_message {
 	message_t gen;
@ -45,4 +46,7 @@ void reset_imap_messages( imap_messages_t *msgs );
 void imap_ensure_relative( imap_messages_t *msgs );
 void imap_ensure_absolute( imap_messages_t *msgs );
 char *imap_utf8_to_utf7( const char *buf );
 int imap_utf7_to_utf8( const char *buf, int argl, char *outbuf );
 #endif
--- a/src/imap_utf7.c
+++ b/src/imap_utf7.c
@ -0,0 +1,288 @@
 // SPDX-FileCopyrightText: 2018-2021 Georgy Kibardin <georgy@kibardin.name>
 // SPDX-FileCopyrightText: 2022 Oswald Buddenhagen <ossi@users.sf.net>
 // SPDX-License-Identifier: GPL-2.0-or-later WITH LicenseRef-isync-GPL-exception
 //
 // mbsync - mailbox synchronizer
 //
 #include "imap_p.h"
 // Debug tracing for the UTF-7 converters: dbg() forwards its arguments to
 // print() when DEBUG_IMAP_UTF7 is defined, and expands to an empty statement
 // otherwise.
 #ifdef DEBUG_IMAP_UTF7
 # define dbg(...) print(__VA_ARGS__)
 #else
 # define dbg(...) do { } while (0)
 #endif
 // Bit-level FIFO used to regroup bits between 6-bit base64 digits and
 // 16/32-bit UTF-16 units. New bits are shifted in at the low end of value;
 // the oldest pending bits are the most significant of the low `bits` bits.
 struct bit_fifo {
 	unsigned long long value;  // accumulated bits; only the low `bits` bits are pending
 	uint bits;  // number of pending bits in value
 };
 // Append the `size` low-order bits given in `bits` to the FIFO.
 // The caller must make sure that `bits` has nothing set above `size` bits.
 static void
 add_bits( struct bit_fifo *fifo, uint bits, uint size )
 {
 	fifo->value <<= size;
 	fifo->value |= bits;
 	fifo->bits += size;
 	// The pending bits must still fit into the accumulator.
 	assert( fifo->bits <= sizeof(fifo->value) * 8 );
 }
 // Remove the `size` oldest pending bits from the FIFO and return them.
 // The caller must make sure that at least `size` bits are pending.
 static uint
 eat_bits( struct bit_fifo *fifo, uint size )
 {
 	const unsigned long long mask = (1LL << size) - 1;
 	fifo->bits -= size;
 	return (uint)((fifo->value >> fifo->bits) & mask);
 }
 // Return the `size` oldest pending bits without removing them from the FIFO.
 // The caller must make sure that at least `size` bits are pending.
 static uint
 peek_bits( struct bit_fifo *fifo, uint size )
 {
 	const unsigned long long mask = (1LL << size) - 1;
 	const uint skip = fifo->bits - size;
 	return (uint)((fifo->value >> skip) & mask);
 }
 // Store chr (truncated to char) at the write cursor *p and advance the cursor.
 static void
 add_char( char **p, uint chr )
 {
 	char *dst = *p;
 	*dst = (char)chr;
 	*p = dst + 1;
 }
 // Fetch the byte at the read cursor *p as an unsigned value and advance the cursor.
 static uchar
 eat_char( const char **p )
 {
 	const char *src = *p;
 	*p = src + 1;
 	return (uchar)*src;
 }
 static uint
 read_as_utf8( const char **utf8_buf_p )
 {
 	uchar chr = eat_char( utf8_buf_p );
 	if (chr < 0x80)
 		return chr;
 	if ((chr & 0xf8) == 0xf0) {
 		uchar chr2 = eat_char( utf8_buf_p );
 		if ((chr2 & 0xc0) != 0x80)
 			return ~0;
 		uchar chr3 = eat_char( utf8_buf_p );
 		if ((chr3 & 0xc0) != 0x80)
 			return ~0;
 		uchar chr4 = eat_char( utf8_buf_p );
 		if ((chr4 & 0xc0) != 0x80)
 			return ~0;
 		return ((chr & 0x7) << 18) |
 		       ((chr2 & 0x3f) << 12) |
 		       ((chr3 & 0x3f) << 6) |
 		       (chr4 & 0x3f);
 	}
 	if ((chr & 0xf0) == 0xe0) {
 		uchar chr2 = eat_char( utf8_buf_p );
 		if ((chr2 & 0xc0) != 0x80)
 			return ~0;
 		uchar chr3 = eat_char( utf8_buf_p );
 		if ((chr3 & 0xc0) != 0x80)
 			return ~0;
 		return ((chr & 0xf) << 12) |
 		       ((chr2 & 0x3f) << 6) |
 		       (chr3 & 0x3f);
 	}
 	if ((chr & 0xe0) == 0xc0) {
 		uchar chr2 = eat_char( utf8_buf_p );
 		if ((chr2 & 0xc0) != 0x80)
 			return ~0;
 		return (chr & 0x1f) << 6 |
 		       (chr2 & 0x3f);
 	}
 	return ~0;
 }
 // Tell whether a code point must go into a base64 shift sequence:
 // everything except NUL and the range 0x20..0x7e.
 static int
 needs_encoding( uint chr )
 {
 	if (!chr)
 		return 0;
 	return chr <= 0x1f || chr >= 0x7f;
 }
 // Split a supplementary-plane code point into a UTF-16 surrogate pair,
 // returned as one 32-bit value: high surrogate in the upper 16 bits,
 // low surrogate in the lower 16 bits.
 static uint
 utf16_encode( uint chr )
 {
 	uint offset = chr - 0x10000;
 	uint high = 0xd800 + (offset >> 10);
 	uint low = 0xdc00 + (offset & 0x3ff);
 	return (high << 16) | low;
 }
 // Map a 6-bit value to its digit in the modified base64 alphabet,
 // which ends in "+," here.
 static uchar
 b64_encode( uint chr )
 {
 	static const char alphabet[] =
 		"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
 	assert( chr <= 0x3f );
 	return (uchar)alphabet[chr];
 }
 char *
 imap_utf8_to_utf7( const char *buf )
 {
 	// Size requirements:
 	// - pass-through: l, 1 => 1
 	// - all "&": l * 2, 1 => 2
 	// - 7-bit: (l * 2 * 4 + 2) / 3 + 2, ~ l * 2.7, 1 => 5
 	// - 3-octet: (l / 3 * 2 * 4 + 2) / 3 + 2, ~ l * 0.9, 3 => 5
 	// - 4-octet: (l / 4 * 2 * 2 * 4 + 2) / 3 + 2, ~ l * 1.3, 4 => 8
 	// => worst case: "&" and 7-bit alternating: l * 3.5, 2 => 7
 	int outsz = strlen( buf ) * 7 / 2 + 3;
 	char *result = nfmalloc( outsz );
 	char *outp = result;
 	struct bit_fifo fifo = { 0, 0 };
 	int encoding = 0;
 	uint chr;
 	do {
 		chr = read_as_utf8( &buf );
 		if (chr == ~0U) {
 			dbg( "Error: invalid UTF-8 string\n" );
 			free( result );
 			return NULL;
 		}
 		if (needs_encoding( chr )) {
 			if (!encoding) {
 				add_char( &outp, '&' );
 				encoding = 1;
 			}
 			if (chr <= 0xffff)
 				add_bits( &fifo, chr, 16 );
 			else
 				add_bits( &fifo, utf16_encode( chr ), 32 );
 			while (fifo.bits >= 6)
 				add_char( &outp, b64_encode( eat_bits( &fifo, 6 ) ) );
 		} else {
 			if (encoding) {
 				if (fifo.bits) {
 					uint trailing_bits = 6 - fifo.bits;
 					uchar trail = b64_encode( eat_bits( &fifo, fifo.bits ) << trailing_bits );
 					add_char( &outp, trail );
 				}
 				add_char( &outp, '-' );
 				encoding = 0;
 			}
 			add_char( &outp, chr );
 			if (chr == '&')
 				add_char( &outp, '-' );
 		}
 	} while (chr);
 	assert( (int)(outp - result) <= outsz );
 	return result;
 }
 // Append the UTF-8 encoding of code point chr at the write cursor *outp,
 // advancing the cursor by the 1 to 4 bytes written.
 // chr must not exceed U+10FFFF, the largest value a UTF-16 surrogate pair
 // can express; anything larger is a programming error.
 static void
 write_as_utf8( char **outp, uint chr )
 {
 	if (chr <= 0x7f) {
 		add_char( outp, chr );
 	} else if (chr <= 0x7ff) {
 		add_char( outp, (chr >> 6) | 0xc0 );
 		add_char( outp, (chr & 0x3f) | 0x80 );
 	} else if (chr <= 0xffff) {
 		add_char( outp, (chr >> 12) | 0xe0 );
 		add_char( outp, ((chr >> 6) & 0x3f) | 0x80 );
 		add_char( outp, (chr & 0x3f) | 0x80 );
 	} else {
 		// utf16_decode() yields values up to 0x10ffff (pair DBFF/DFFF), so
 		// the bound must cover plane 16 as well; a tighter limit would abort
 		// on valid input.
 		assert( chr <= 0x10ffff );
 		add_char( outp, (chr >> 18) | 0xf0 );
 		add_char( outp, ((chr >> 12) & 0x3f) | 0x80 );
 		add_char( outp, ((chr >> 6) & 0x3f) | 0x80 );
 		add_char( outp, (chr & 0x3f) | 0x80 );
 	}
 }
 // Tell whether a 16-bit UTF-16 unit is a high surrogate (0xd800..0xdbff),
 // i.e. whether a second unit is required to complete the code point.
 static int
 need_another_16bit( uint bits )
 {
 	const uint high_surrogate_tag = 0xd800 >> 10;
 	return ((bits >> 10) & 0x3f) == high_surrogate_tag;
 }
 // Combine a UTF-16 surrogate pair into a code point. The pair is passed as
 // one 32-bit value: high surrogate in the upper 16 bits, low surrogate in
 // the lower 16 bits.
 static uint
 utf16_decode( uint subject )
 {
 	uint high = subject >> 16;
 	uint low = subject & 0xffff;
 	return 0x10000 + ((high - 0xd800) << 10) + (low - 0xdc00);
 }
 // Map a digit of the modified base64 alphabet (ending in "+,") to its
 // 6-bit value. Returns ~0 for any byte outside the alphabet, including
 // bytes with the high bit set.
 static uint
 b64_decode( uchar chr )
 {
 	static const uint lu[128] = {
 		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,
 		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,
 		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 62, 63, ~0, ~0, ~0,
 		52, 53, 54, 55, 56, 57, 58, 59, 60, 61, ~0, ~0, ~0, ~0, ~0, ~0,
 		~0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
 		15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, ~0, ~0, ~0, ~0, ~0,
 		~0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
 		41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ~0, ~0, ~0, ~0, ~0,
 	};
 	// The table covers 7-bit values only; never index past its end.
 	if (chr >= sizeof(lu) / sizeof(lu[0]))
 		return ~0U;
 	return lu[chr];
 }
 // Convert the modified UTF-7 string buf of bufl bytes to UTF-8.
 // The result is written to outbuf without a terminating NUL. No buffer size
 // is passed in, so the caller must reserve enough room (see the estimate
 // below).
 // Returns the number of bytes written, or -1 if the input is malformed:
 // 8-bit byte, byte outside the base64 alphabet, unterminated shift sequence,
 // truncated code unit, or unpaired UTF-16 surrogate (high or low).
 int
 imap_utf7_to_utf8( const char *buf, int bufl, char *outbuf )
 {
 	// Size requirements:
 	// - pass-through: l (shortest worst case)
 	// - all "&": l / 2, 2 => 1, * .5
 	// - 7-bit: ((l - 2) * 3 + 1) / 4 / 2, ~ l * .38, 5 => 1, * .2
 	// - 3-octet: ((l - 2) * 3 + 1) / 4 / 2 * 3, ~ l * 1.13, 5 => 3, * .6 (generic worst case)
 	// - 4-octet: ((l - 2) * 3 + 1) / 4 / 2 / 2 * 4, ~ l * .75, 8 => 4, * .5
 	// => reserve bufl * 9 / 8
 	char *outp = outbuf;
 	struct bit_fifo fifo = { 0, 0 };
 	const char *bufe = buf + bufl;
 	while (buf != bufe) {
 		uchar chr = *buf++;
 		if (chr != '&') {
 			// Outside of shift sequences only 7-bit bytes are legal;
 			// they are copied verbatim.
 			if (chr & 0x80) {
 				dbg( "Error: 8-bit char %x\n", chr );
 				return -1;
 			}
 			add_char( &outp, chr );
 			continue;
 		}
 		if (buf == bufe) {
 			dbg( "Error: unterminated shift sequence\n" );
 			return -1;
 		}
 		chr = *buf++;
 		if (chr == '-') {
 			// "&-" stands for a literal ampersand.
 			add_char( &outp, '&' );
 			continue;
 		}
 		// Stale contents of fifo.value are harmless, as all reads are
 		// masked by the bit count.
 		fifo.bits = 0;
 		do {
 			if (chr & 0x80) {
 				dbg( "Error: 8-bit char %x\n", chr );
 				return -1;
 			}
 			uint bits = b64_decode( chr );
 			if (bits == ~0U) {
 				dbg( "Error: char %x outside alphabet\n", chr );
 				return -1;
 			}
 			add_bits( &fifo, bits, 6 );
 			if (fifo.bits >= 16) {
 				if (need_another_16bit( peek_bits( &fifo, 16 ) )) {
 					// High surrogate: wait until the second unit is complete.
 					if (fifo.bits >= 32) {
 						uint utf16 = eat_bits( &fifo, 32 );
 						if ((utf16 & 0xfc00) != 0xdc00) {
 							dbg( "Error: unpaired UTF-16 surrogate\n" );
 							return -1;
 						}
 						write_as_utf8( &outp, utf16_decode( utf16 ) );
 					}
 				} else {
 					uint utf16 = eat_bits( &fifo, 16 );
 					if ((utf16 & 0xfc00) == 0xdc00) {
 						// A low surrogate that does not follow a high one
 						// has no valid UTF-8 representation.
 						dbg( "Error: unpaired UTF-16 surrogate\n" );
 						return -1;
 					}
 					write_as_utf8( &outp, utf16 );
 				}
 			}
 			if (buf == bufe) {
 				dbg( "Error: unterminated shift sequence\n" );
 				return -1;
 			}
 			chr = *buf++;
 		} while (chr != '-');
 		// More than one digit's worth of leftover bits means that the
 		// sequence ended in the middle of a code unit (this includes a
 		// high surrogate still waiting for its partner).
 		if (fifo.bits > 6) {
 			dbg( "Error: incomplete code point\n" );
 			return -1;
 		}
 	}
 	return (int)(outp - outbuf);
 }
--- a/src/tst_imap_utf7.c
+++ b/src/tst_imap_utf7.c
@ -0,0 +1,116 @@
 // SPDX-FileCopyrightText: 2022 Oswald Buddenhagen <ossi@users.sf.net>
 // SPDX-License-Identifier: GPL-2.0-or-later
 //
 // isync test suite
 //
 #include "imap_p.h"
 // Conversion test vectors. Each entry pairs a UTF-8 string with its
 // modified UTF-7 counterpart and is used by main() in both directions.
 // A NULL utf7 means that converting the utf8 string must fail;
 // a NULL utf8 means that converting the utf7 string must fail.
 static struct {
 	const char *utf8, *utf7;
 } data[] = {
 	{ u8"", "" },
 	{ u8"1", "1" },
 	{ u8"word", "word" },
 	{ u8"&", "&-" },
 	{ NULL, "&" },
 	{ NULL, "&-&" },
 	{ u8"&&", "&-&-" },
 	{ u8"1&1", "1&-1" },
 	{ u8"&1&", "&-1&-" },
 	{ u8"\t", "&AAk-" },
 	{ NULL, "&AAk" },
 	{ NULL, "&AA-" },
 	{ NULL, "&*Ak-" },
 	{ NULL, "&&-" },
 	{ u8"m\x7f""ll", "m&AH8-ll" },
 	{ u8"\t&", "&AAk-&-" },
 	{ u8"\t&\t", "&AAk-&-&AAk-" },
 	{ u8"&\t", "&-&AAk-" },
 	{ u8"&\t&", "&-&AAk-&-" },
 	{ u8"ä", "&AOQ-" },
 	{ u8"\x83\x84", NULL },
 	{ u8"\xc3\xc4", NULL },
 	{ u8"\xc3", NULL },
 	{ u8"äö", "&AOQA9g-" },
 	{ u8"äöü", "&AOQA9gD8-" },
 	{ u8"Ḁ", "&HgA-" },
 	{ u8"\xe1\xc8\x80", NULL },
 	{ u8"\xe1\xb8\xf0", NULL },
 	{ u8"\xe1\xb8", NULL },
 	{ u8"\xe1", NULL },
 	{ u8"Ḁḁ", "&HgAeAQ-" },
 	{ u8"😂", "&2D3eAg-" },
 	{ u8"\xf8\x9f\x98\x82", NULL },
 	{ u8"\xf0\xcf\x98\x82", NULL },
 	{ u8"\xf0\x9f\xd8\x82", NULL },
 	{ u8"\xf0\x9f\x98\xe2", NULL },
 	{ u8"\xf0\x9f\x98", NULL },
 	{ u8"\xf0\x9f", NULL },
 	{ u8"\xf0", NULL },
 	{ NULL, "&2D0-" },
 	{ u8"😈😎", "&2D3eCNg93g4-" },
 	{ u8"müll", "m&APw-ll" },
 	{ u8"mü", "m&APw-" },
 	{ u8"über", "&APw-ber" },
 };
 // Run every test vector through both converters and report deviations.
 // Returns 0 if all conversions behaved as expected, 1 otherwise.
 int
 main( void )
 {
 	int ret = 0;
 	// Pass 1: UTF-8 => UTF-7, for all entries that have a UTF-8 side.
 	for (uint i = 0; i < as(data); i++) {
 		const char *src = data[i].utf8;
 		const char *want = data[i].utf7;
 		if (!src)
 			continue;
 		xprintf( "To UTF-7 \"%s\" (\"%!s\") ...\n", src, src );
 		char *got = imap_utf8_to_utf7( src );
 		if (!got) {
 			if (want) {
 				xprintf( "Conversion failure.\n" );
 				ret = 1;
 			}
 			continue;
 		}
 		if (!want) {
 			xprintf( "Unexpected success: \"%s\" (\"%!s\")\n", got, got );
 			ret = 1;
 		} else if (strcmp( got, want ) != 0) {
 			xprintf( "Mismatch, got \"%s\" (\"%!s\"), want \"%!s\"\n",
 			         got, got, want );
 			ret = 1;
 		}
 		free( got );
 	}
 	// Pass 2: UTF-7 => UTF-8, for all entries that have a UTF-7 side.
 	for (uint i = 0; i < as(data); i++) {
 		const char *src = data[i].utf7;
 		const char *want = data[i].utf8;
 		if (!src)
 			continue;
 		xprintf( "From UTF-7 \"%!s\" ...\n", src );
 		int srclen = strlen( src );
 		char got[1000];
 		int gotlen = imap_utf7_to_utf8( src, srclen, got );
 		if (gotlen < 0) {
 			if (want) {
 				xprintf( "Conversion failure.\n" );
 				ret = 1;
 			}
 			continue;
 		}
 		if (!want) {
 			xprintf( "Unexpected success: \"%.*s\" (\"%.*!s\")\n",
 			         gotlen, got, gotlen, got );
 			ret = 1;
 		} else {
 			int wantlen = strlen( want );
 			if (gotlen != wantlen || memcmp( got, want, gotlen ) != 0) {
 				xprintf( "Mismatch, got \"%.*s\" (\"%.*!s\"), want \"%s\" (\"%!s\")\n",
 				         gotlen, got, gotlen, got, want, want );
 				ret = 1;
 			}
 		}
 		// The output must stay within the documented size estimate.
 		assert( gotlen < srclen * 9 / 8 + 1 );
 	}
 	return ret;
 }