add support for UTF-7 mailbox names

This finally makes us compliant with IMAP4rev1. How fitting that
IMAP4rev2, released in the meantime, demoted UTF-7 to legacy status ...

based on a patch by Georgy Kibardin <georgy@kibardin.name>.
This commit is contained in:
Oswald Buddenhagen 2022-05-23 10:12:38 +02:00
parent efab63fb8e
commit 52c063fd45
7 changed files with 434 additions and 4 deletions

2
TODO
View File

@ -6,8 +6,6 @@ automatically resume upon transient errors, e.g. "connection reset by peer"
or timeout after some data was already transmitted. or timeout after some data was already transmitted.
possibly also try to handle Exchange's "glitches" somehow. possibly also try to handle Exchange's "glitches" somehow.
add support for IMAP UTF-7 (for internationalized mailbox names).
uidvalidity lock timeout handling would be a good idea. uidvalidity lock timeout handling would be a good idea.
should complain when multiple Channels match the same folders. should complain when multiple Channels match the same folders.

1
src/.gitignore vendored
View File

@ -3,6 +3,7 @@
/mbsync /mbsync
/mdconvert /mdconvert
/tst_imap_msgs /tst_imap_msgs
/tst_imap_utf7
/tst_msg_cvt /tst_msg_cvt
/tst_timers /tst_timers
/tmp /tmp

View File

@ -5,7 +5,7 @@
mbsync_SOURCES = \ mbsync_SOURCES = \
util.c config.c socket.c \ util.c config.c socket.c \
driver.c drv_proxy.c \ driver.c drv_proxy.c \
drv_imap.c imap_msgs.c \ drv_imap.c imap_msgs.c imap_utf7.c \
drv_maildir.c \ drv_maildir.c \
sync.c sync_state.c sync_msg_cvt.c \ sync.c sync_state.c sync_msg_cvt.c \
main.c main_sync.c main_list.c main.c main_sync.c main_list.c
@ -54,10 +54,12 @@ man_MANS = mbsync.1 $(mdconvert_man)
tst_imap_msgs_SOURCES = tst_imap_msgs.c imap_msgs.c util.c tst_imap_msgs_SOURCES = tst_imap_msgs.c imap_msgs.c util.c
tst_imap_utf7_SOURCES = tst_imap_utf7.c imap_utf7.c util.c
tst_msg_cvt_SOURCES = tst_msg_cvt.c sync_msg_cvt.c util.c tst_msg_cvt_SOURCES = tst_msg_cvt.c sync_msg_cvt.c util.c
tst_msg_cvt_CFLAGS = -DQPRINTF_BUFF=10000 tst_msg_cvt_CFLAGS = -DQPRINTF_BUFF=10000
check_PROGRAMS = tst_imap_msgs tst_msg_cvt check_PROGRAMS = tst_imap_msgs tst_imap_utf7 tst_msg_cvt
TESTS = $(check_PROGRAMS) TESTS = $(check_PROGRAMS)
tst_timers_SOURCES = tst_timers.c util.c tst_timers_SOURCES = tst_timers.c util.c

View File

@ -1577,6 +1577,7 @@ list3_rsp_atom( imap_store_t *ctx, char *arg, uint len, int type ATTR_UNUSED )
string_list_t *narg; string_list_t *narg;
int argl = (int)len; int argl = (int)len;
uint l; uint l;
char rarg[1130]; // See imap_utf7_to_utf8() for the origin of that number
if (!arg) if (!arg)
return LIST_BAD; return LIST_BAD;
@ -1608,6 +1609,16 @@ list3_rsp_atom( imap_store_t *ctx, char *arg, uint len, int type ATTR_UNUSED )
} }
if (argl >= 5 && !memcmp( arg + argl - 5, ".lock", 5 )) /* workaround broken servers */ if (argl >= 5 && !memcmp( arg + argl - 5, ".lock", 5 )) /* workaround broken servers */
return LIST_OK; return LIST_OK;
if (!(CAP(UTF8_ACCEPT) || CAP(UTF8_ONLY))) {
int rargl = imap_utf7_to_utf8( arg, argl, rarg );
if (rargl < 0) {
error( "IMAP error: invalid modified-UTF-7 string '%.*s'.\n", argl, arg );
return LIST_BAD;
}
assert( (uint)rargl < sizeof(rarg) );
arg = rarg;
argl = rargl;
}
if (map_name( arg, argl, (char **)&narg, offsetof(string_list_t, string), ctx->delimiter, "/") < 0) { if (map_name( arg, argl, (char **)&narg, offsetof(string_list_t, string), ctx->delimiter, "/") < 0) {
warn( "IMAP warning: ignoring mailbox %.*s (reserved character '/' in name)\n", argl, arg ); warn( "IMAP warning: ignoring mailbox %.*s (reserved character '/' in name)\n", argl, arg );
return LIST_OK; return LIST_OK;
@ -1665,6 +1676,16 @@ prepare_name( char **buf, const imap_store_t *ctx, const char *prefix, const cha
return -1; return -1;
default: default:
memcpy( *buf, prefix, pl ); memcpy( *buf, prefix, pl );
if (!(CAP(UTF8_ACCEPT) || CAP(UTF8_ONLY))) {
char *nbuf = imap_utf8_to_utf7( *buf );
if (!nbuf) {
error( "IMAP error: invalid UTF-8 string '%s'\n", *buf );
free( *buf );
return -1;
}
free( *buf );
*buf = nbuf;
}
return 0; return 0;
} }
} }

View File

@ -10,6 +10,7 @@
#include "driver.h" #include "driver.h"
//#define DEBUG_IMAP_MSGS //#define DEBUG_IMAP_MSGS
//#define DEBUG_IMAP_UTF7
typedef union imap_message { typedef union imap_message {
message_t gen; message_t gen;
@ -45,4 +46,7 @@ void reset_imap_messages( imap_messages_t *msgs );
void imap_ensure_relative( imap_messages_t *msgs ); void imap_ensure_relative( imap_messages_t *msgs );
void imap_ensure_absolute( imap_messages_t *msgs ); void imap_ensure_absolute( imap_messages_t *msgs );
char *imap_utf8_to_utf7( const char *buf );
int imap_utf7_to_utf8( const char *buf, int argl, char *outbuf );
#endif #endif

288
src/imap_utf7.c Normal file
View File

@ -0,0 +1,288 @@
// SPDX-FileCopyrightText: 2018-2021 Georgy Kibardin <georgy@kibardin.name>
// SPDX-FileCopyrightText: 2022 Oswald Buddenhagen <ossi@users.sf.net>
// SPDX-License-Identifier: GPL-2.0-or-later WITH LicenseRef-isync-GPL-exception
//
// mbsync - mailbox synchronizer
//
#include "imap_p.h"
#ifdef DEBUG_IMAP_UTF7
# define dbg(...) print(__VA_ARGS__)
#else
# define dbg(...) do { } while (0)
#endif
// Bit accumulator used by both converters to re-chunk between 6-bit base64
// digits and 16/32-bit UTF-16 units.
struct bit_fifo {
// Bit store: add_bits() shifts new bits in at the least significant end.
unsigned long long value;
// Number of low-order bits in value that have been added but not yet eaten.
uint bits;
};
// Append the low `size` bits of `bits` to the fifo.
// The fifo must not grow beyond the width of its backing store.
static void
add_bits( struct bit_fifo *fifo, uint bits, uint size )
{
	uint filled = fifo->bits + size;

	assert( filled <= sizeof(fifo->value) * 8 );
	fifo->value = (fifo->value << size) | bits;
	fifo->bits = filled;
}
// Remove and return the oldest `size` bits still held in the fifo.
static uint
eat_bits( struct bit_fifo *fifo, uint size )
{
	unsigned long long mask = (1LL << size) - 1;

	fifo->bits -= size;
	return (uint)((fifo->value >> fifo->bits) & mask);
}
// Return the oldest `size` bits held in the fifo without consuming them.
static uint
peek_bits( struct bit_fifo *fifo, uint size )
{
	uint skip = fifo->bits - size;
	unsigned long long mask = (1LL << size) - 1;

	return (uint)((fifo->value >> skip) & mask);
}
// Store `chr` (truncated to char) at the write cursor *p and advance the cursor.
static void
add_char( char **p, uint chr )
{
	char *dst = *p;

	*dst = (char)chr;
	*p = dst + 1;
}
// Fetch the byte at the read cursor *p as an unsigned value and advance the cursor.
static uchar
eat_char( const char **p )
{
	const char *src = *p;

	*p = src + 1;
	return (uchar)*src;
}
static uint
read_as_utf8( const char **utf8_buf_p )
{
uchar chr = eat_char( utf8_buf_p );
if (chr < 0x80)
return chr;
if ((chr & 0xf8) == 0xf0) {
uchar chr2 = eat_char( utf8_buf_p );
if ((chr2 & 0xc0) != 0x80)
return ~0;
uchar chr3 = eat_char( utf8_buf_p );
if ((chr3 & 0xc0) != 0x80)
return ~0;
uchar chr4 = eat_char( utf8_buf_p );
if ((chr4 & 0xc0) != 0x80)
return ~0;
return ((chr & 0x7) << 18) |
((chr2 & 0x3f) << 12) |
((chr3 & 0x3f) << 6) |
(chr4 & 0x3f);
}
if ((chr & 0xf0) == 0xe0) {
uchar chr2 = eat_char( utf8_buf_p );
if ((chr2 & 0xc0) != 0x80)
return ~0;
uchar chr3 = eat_char( utf8_buf_p );
if ((chr3 & 0xc0) != 0x80)
return ~0;
return ((chr & 0xf) << 12) |
((chr2 & 0x3f) << 6) |
(chr3 & 0x3f);
}
if ((chr & 0xe0) == 0xc0) {
uchar chr2 = eat_char( utf8_buf_p );
if ((chr2 & 0xc0) != 0x80)
return ~0;
return (chr & 0x1f) << 6 |
(chr2 & 0x3f);
}
return ~0;
}
// Tell whether a code point must go into a base64 shift sequence.
// NUL and the range 0x20..0x7e are passed through by the caller.
static int
needs_encoding( uint chr )
{
	if (!chr)
		return 0;
	return chr < 0x20 || chr > 0x7e;
}
// Split a supplementary-plane code point into a UTF-16 surrogate pair,
// returned as one 32-bit value with the high surrogate in the upper half.
static uint
utf16_encode( uint chr )
{
	uint offset = chr - 0x10000;
	uint high = 0xd800 + (offset >> 10);
	uint low = 0xdc00 + (offset & 0x3ff);

	return (high << 16) | low;
}
// Map a 6-bit value to its digit in the modified base64 alphabet,
// which uses ',' instead of '/' as the last digit.
static uchar
b64_encode( uint chr )
{
	static const char alphabet[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789+,";

	assert( chr <= 0x3f );
	return (uchar)alphabet[chr];
}
char *
imap_utf8_to_utf7( const char *buf )
{
// Size requirements:
// - pass-through: l, 1 => 1
// - all "&": l * 2, 1 => 2
// - 7-bit: (l * 2 * 4 + 2) / 3 + 2, ~ l * 2.7, 1 => 5
// - 3-octet: (l / 3 * 2 * 4 + 2) / 3 + 2, ~ l * 0.9, 3 => 5
// - 4-octet: (l / 4 * 2 * 2 * 4 + 2) / 3 + 2, ~ l * 1.3, 4 => 8
// => worst case: "&" and 7-bit alternating: l * 3.5, 2 => 7
int outsz = strlen( buf ) * 7 / 2 + 3;
char *result = nfmalloc( outsz );
char *outp = result;
struct bit_fifo fifo = { 0, 0 };
int encoding = 0;
uint chr;
do {
chr = read_as_utf8( &buf );
if (chr == ~0U) {
dbg( "Error: invalid UTF-8 string\n" );
free( result );
return NULL;
}
if (needs_encoding( chr )) {
if (!encoding) {
add_char( &outp, '&' );
encoding = 1;
}
if (chr <= 0xffff)
add_bits( &fifo, chr, 16 );
else
add_bits( &fifo, utf16_encode( chr ), 32 );
while (fifo.bits >= 6)
add_char( &outp, b64_encode( eat_bits( &fifo, 6 ) ) );
} else {
if (encoding) {
if (fifo.bits) {
uint trailing_bits = 6 - fifo.bits;
uchar trail = b64_encode( eat_bits( &fifo, fifo.bits ) << trailing_bits );
add_char( &outp, trail );
}
add_char( &outp, '-' );
encoding = 0;
}
add_char( &outp, chr );
if (chr == '&')
add_char( &outp, '-' );
}
} while (chr);
assert( (int)(outp - result) <= outsz );
return result;
}
// Append the UTF-8 encoding of code point `chr` at the write cursor *outp,
// advancing the cursor by one to four bytes.
//
// The largest value a UTF-16 surrogate pair can denote is U+10FFFF, so that
// is the upper bound asserted here; the previous bound of 0xfffff aborted on
// valid plane-16 code points received from a server.
static void
write_as_utf8( char **outp, uint chr )
{
	if (chr <= 0x7f) {
		add_char( outp, chr );
	} else if (chr <= 0x7ff) {
		add_char( outp, (chr >> 6) | 0xc0 );
		add_char( outp, (chr & 0x3f) | 0x80 );
	} else if (chr <= 0xffff) {
		add_char( outp, (chr >> 12) | 0xe0 );
		add_char( outp, ((chr >> 6) & 0x3f) | 0x80 );
		add_char( outp, (chr & 0x3f) | 0x80 );
	} else {
		assert( chr <= 0x10ffff );
		add_char( outp, (chr >> 18) | 0xf0 );
		add_char( outp, ((chr >> 12) & 0x3f) | 0x80 );
		add_char( outp, ((chr >> 6) & 0x3f) | 0x80 );
		add_char( outp, (chr & 0x3f) | 0x80 );
	}
}
// Tell whether a 16-bit unit is a high surrogate (0xd800..0xdbff),
// i.e. whether a second unit is needed to form a code point.
static int
need_another_16bit( uint bits )
{
	uint top6 = bits & 0xfc00;

	return top6 == 0xd800;
}
// Combine a surrogate pair (high surrogate in the upper 16 bits, low
// surrogate in the lower 16 bits) into the code point it denotes.
static uint
utf16_decode( uint subject )
{
	uint high = subject >> 16;
	uint low = subject & 0xffff;

	return 0x10000 + ((high - 0xd800) << 10) + (low - 0xdc00);
}
// Map a digit of the modified base64 alphabet (A-Z, a-z, 0-9, '+', ',')
// to its 6-bit value. Returns ~0U for any other byte, including bytes
// >= 0x80, which would otherwise index past the end of the table.
static uint
b64_decode( uchar chr )
{
	// Read-only lookup table covering the 7-bit range.
	static const uint lu[128] = {
		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,
		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0,
		~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0, 62, 63, ~0, ~0, ~0,
		52, 53, 54, 55, 56, 57, 58, 59, 60, 61, ~0, ~0, ~0, ~0, ~0, ~0,
		~0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
		15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, ~0, ~0, ~0, ~0, ~0,
		~0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
		41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ~0, ~0, ~0, ~0, ~0,
	};

	if (chr >= sizeof(lu) / sizeof(lu[0]))
		return ~0U;
	return lu[chr];
}
// Convert `bufl` bytes of modified UTF-7 at `buf` to UTF-8 in `outbuf`.
//
// Returns the number of bytes written, or -1 if the input is malformed.
// The output is not NUL-terminated. No output size is passed in, so the
// caller must provide a buffer large enough for the worst case derived below.
//
// Rejected as malformed: 8-bit bytes, bytes outside the base64 alphabet
// within a shift sequence, unterminated shift sequences, unpaired UTF-16
// surrogates (high or low), and shift sequences that end with an incomplete
// code unit or a superfluous base64 digit.
int
imap_utf7_to_utf8( const char *buf, int bufl, char *outbuf )
{
	// Size requirements:
	// - pass-through: l (shortest worst case)
	// - all "&": l / 2, 2 => 1, * .5
	// - 7-bit: ((l - 2) * 3 + 1) / 4 / 2, ~ l * .38, 5 => 1, * .2
	// - 3-octet: ((l - 2) * 3 + 1) / 4 / 2 * 3, ~ l * 1.13, 5 => 3, * .6 (generic worst case)
	// - 4-octet: ((l - 2) * 3 + 1) / 4 / 2 / 2 * 4, ~ l * .75, 8 => 4, * .5
	// => reserve bufl * 9 / 8
	char *outp = outbuf;
	struct bit_fifo fifo = { 0, 0 };
	const char *bufe = buf + bufl;

	while (buf != bufe) {
		uchar chr = *buf++;
		if (chr != '&') {
			// Literal character outside any shift sequence.
			if (chr & 0x80) {
				dbg( "Error: 8-bit char %x\n", chr );
				return -1;
			}
			add_char( &outp, chr );
			continue;
		}
		if (buf == bufe) {
			dbg( "Error: unterminated shift sequence\n" );
			return -1;
		}
		chr = *buf++;
		if (chr == '-') {
			// "&-" is the escape for a literal ampersand.
			add_char( &outp, '&' );
			continue;
		}
		// Base64 shift sequence; stale bits in fifo.value are masked out
		// on extraction, so resetting the count is sufficient.
		fifo.bits = 0;
		do {
			if (chr & 0x80) {
				dbg( "Error: 8-bit char %x\n", chr );
				return -1;
			}
			uint bits = b64_decode( chr );
			if (bits == ~0U) {
				dbg( "Error: char %x outside alphabet\n", chr );
				return -1;
			}
			add_bits( &fifo, bits, 6 );
			if (fifo.bits >= 16) {
				if (need_another_16bit( peek_bits( &fifo, 16 ) )) {
					// High surrogate: wait until the second unit is complete.
					if (fifo.bits >= 32) {
						uint utf16 = eat_bits( &fifo, 32 );
						if ((utf16 & 0xfc00) != 0xdc00) {
							dbg( "Error: unpaired UTF-16 surrogate\n" );
							return -1;
						}
						write_as_utf8( &outp, utf16_decode( utf16 ) );
					}
				} else {
					uint unit = eat_bits( &fifo, 16 );
					// A low surrogate without a preceding high surrogate
					// is not a character and would yield invalid UTF-8.
					if ((unit & 0xfc00) == 0xdc00) {
						dbg( "Error: unpaired UTF-16 surrogate\n" );
						return -1;
					}
					write_as_utf8( &outp, unit );
				}
			}
			if (buf == bufe) {
				dbg( "Error: unterminated shift sequence\n" );
				return -1;
			}
			chr = *buf++;
		} while (chr != '-');
		// A proper encoding leaves at most 4 padding bits behind. Six or
		// more mean a truncated code unit, a pending high surrogate, or a
		// superfluous digit (e.g. "&A-", which would otherwise decode to
		// nothing).
		if (fifo.bits >= 6) {
			dbg( "Error: incomplete code point\n" );
			return -1;
		}
	}
	return (int)(outp - outbuf);
}

116
src/tst_imap_utf7.c Normal file
View File

@ -0,0 +1,116 @@
// SPDX-FileCopyrightText: 2022 Oswald Buddenhagen <ossi@users.sf.net>
// SPDX-License-Identifier: GPL-2.0-or-later
//
// isync test suite
//
#include "imap_p.h"
// Conversion test vectors. Each entry pairs a UTF-8 string with its
// modified UTF-7 form. A NULL on one side marks the other side as invalid
// input whose conversion is expected to fail.
static struct {
	const char *utf8, *utf7;
} data[] = {
	{ u8"", "" },
	{ u8"1", "1" },
	{ u8"word", "word" },
	{ u8"&", "&-" },
	{ NULL, "&" },
	{ NULL, "&-&" },
	{ u8"&&", "&-&-" },
	{ u8"1&1", "1&-1" },
	{ u8"&1&", "&-1&-" },
	{ u8"\t", "&AAk-" },
	{ NULL, "&AAk" },
	{ NULL, "&AA-" },
	{ NULL, "&*Ak-" },
	{ NULL, "&&-" },
	{ u8"m\x7f""ll", "m&AH8-ll" },
	{ u8"\t&", "&AAk-&-" },
	{ u8"\t&\t", "&AAk-&-&AAk-" },
	{ u8"&\t", "&-&AAk-" },
	{ u8"&\t&", "&-&AAk-&-" },
	{ u8"ä", "&AOQ-" },
	{ u8"\x83\x84", NULL },
	{ u8"\xc3\xc4", NULL },
	{ u8"\xc3", NULL },
	{ u8"äö", "&AOQA9g-" },
	{ u8"äöü", "&AOQA9gD8-" },
	{ u8"Ḁ", "&HgA-" },  // U+1E00
	{ u8"\xe1\xc8\x80", NULL },
	{ u8"\xe1\xb8\xf0", NULL },
	{ u8"\xe1\xb8", NULL },
	{ u8"\xe1", NULL },
	{ u8"Ḁḁ", "&HgAeAQ-" },
	{ u8"😂", "&2D3eAg-" },
	{ u8"\xf8\x9f\x98\x82", NULL },
	{ u8"\xf0\xcf\x98\x82", NULL },
	{ u8"\xf0\x9f\xd8\x82", NULL },
	{ u8"\xf0\x9f\x98\xe2", NULL },
	{ u8"\xf0\x9f\x98", NULL },
	{ u8"\xf0\x9f", NULL },
	{ u8"\xf0", NULL },
	{ NULL, "&2D0-" },
	{ u8"😈😎", "&2D3eCNg93g4-" },
	{ u8"müll", "m&APw-ll" },
	{ u8"mü", "m&APw-" },
	{ u8"über", "&APw-ber" },
};
// Run every test vector through both converters and report deviations.
// Returns 0 if all conversions behaved as expected, 1 otherwise.
int
main( void )
{
	int ret = 0;

	// UTF-8 => UTF-7; entries without a UTF-8 side do not apply.
	for (uint i = 0; i < as(data); i++) {
		const char *src = data[i].utf8;
		const char *want = data[i].utf7;
		if (!src)
			continue;
		xprintf( "To UTF-7 \"%s\" (\"%!s\") ...\n", src, src );
		char *got = imap_utf8_to_utf7( src );
		if (!got) {
			if (want) {
				xprintf( "Conversion failure.\n" );
				ret = 1;
			}
			continue;
		}
		if (!want) {
			xprintf( "Unexpected success: \"%s\" (\"%!s\")\n", got, got );
			ret = 1;
		} else if (strcmp( got, want ) != 0) {
			xprintf( "Mismatch, got \"%s\" (\"%!s\"), want \"%!s\"\n",
			         got, got, want );
			ret = 1;
		}
		free( got );
	}

	// UTF-7 => UTF-8; entries without a UTF-7 side do not apply.
	for (uint i = 0; i < as(data); i++) {
		const char *src = data[i].utf7;
		const char *want = data[i].utf8;
		if (!src)
			continue;
		xprintf( "From UTF-7 \"%!s\" ...\n", src );
		int srclen = strlen( src );
		char out[1000];
		int outlen = imap_utf7_to_utf8( src, srclen, out );
		if (outlen < 0) {
			if (want) {
				xprintf( "Conversion failure.\n" );
				ret = 1;
			}
			continue;
		}
		if (!want) {
			xprintf( "Unexpected success: \"%.*s\" (\"%.*!s\")\n",
			         outlen, out, outlen, out );
			ret = 1;
		} else {
			int wantlen = strlen( want );
			if (outlen != wantlen || memcmp( out, want, outlen )) {
				xprintf( "Mismatch, got \"%.*s\" (\"%.*!s\"), want \"%s\" (\"%!s\")\n",
				         outlen, out, outlen, out, want, want );
				ret = 1;
			}
		}
		// Every successful conversion must stay within the size bound
		// that callers of the converter reserve.
		assert( outlen < srclen * 9 / 8 + 1 );
	}

	return ret;
}