[PATCH] NLS: update handling of Unicode

From: Alan Stern
Date: Thu Apr 30 2009 - 10:08:47 EST


This patch (as1239) updates the kernel's treatment of Unicode. The
character-set conversion routines are well behind the current state of
the Unicode specification: They don't recognize the existence of code
points beyond plane 0 or of surrogate pairs in the UTF-16 encoding.

The old wchar_t 16-bit type is retained because it's still used in
lots of places. This shouldn't cause any new problems; if a
conversion now results in an invalid 16-bit code then before it must
have yielded an undefined code.

Difficult-to-read names like "utf_mbstowcs" are replaced with more
transparent names like "utf8s_to_utf16s" and the ordering of the
parameters is rationalized (buffer lengths come immediate after the
pointers they refer to, and the inputs precede the outputs).
Fortunately the low-level conversion routines are used in only a few
places; the interfaces to the higher-level uni2char and char2uni
methods have been left unchanged.

Signed-off-by: Alan Stern <stern@xxxxxxxxxxxxxxxxxxx>
Acked-by: Clemens Ladisch <clemens@xxxxxxxxxx>

---

This is meant to be applied on top of Clemens's 3-patch series.



Index: usb-2.6/include/linux/nls.h
===================================================================
--- usb-2.6.orig/include/linux/nls.h
+++ usb-2.6/include/linux/nls.h
@@ -3,8 +3,23 @@

#include <linux/init.h>

-/* unicode character */
-typedef __u16 wchar_t;
+/* Unicode has changed over the years. Unicode code points no longer
+ * fit into 16 bits; as of Unicode 5 valid code points range from 0
+ * to 0x10ffff (17 planes, where each plane holds 65536 code points).
+ *
+ * The original decision to represent Unicode characters as 16-bit
+ * wchar_t values is now outdated. But plane 0 still includes the
+ * most commonly used characters, so we will retain it. The newer
+ * 32-bit unicode_t type can be used when it is necessary to
+ * represent the full Unicode character set.
+ */
+
+/* Plane-0 Unicode character */
+typedef u16 wchar_t;
+#define MAX_WCHAR_T 0xffff
+
+/* Arbitrary Unicode character */
+typedef u32 unicode_t;

struct nls_table {
const char *charset;
@@ -21,6 +36,13 @@ struct nls_table {
/* this value hold the maximum octet of charset */
#define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */

+/* Byte order for UTF-16 strings */
+enum utf16_endian {
+ UTF16_HOST_ENDIAN,
+ UTF16_LITTLE_ENDIAN,
+ UTF16_BIG_ENDIAN
+};
+
/* nls.c */
extern int register_nls(struct nls_table *);
extern int unregister_nls(struct nls_table *);
@@ -28,10 +50,11 @@ extern struct nls_table *load_nls(char *
extern void unload_nls(struct nls_table *);
extern struct nls_table *load_nls_default(void);

-extern int utf8_mbtowc(wchar_t *, const __u8 *, int);
-extern int utf8_mbstowcs(wchar_t *, const __u8 *, int);
-extern int utf8_wctomb(__u8 *, wchar_t, int);
-extern int utf8_wcstombs(__u8 *, const wchar_t *, int);
+extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu);
+extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen);
+extern int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs);
+extern int utf16s_to_utf8s(const wchar_t *pwcs, int len,
+ enum utf16_endian endian, u8 *s, int maxlen);

static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c)
{
Index: usb-2.6/fs/nls/nls_base.c
===================================================================
--- usb-2.6.orig/fs/nls/nls_base.c
+++ usb-2.6/fs/nls/nls_base.c
@@ -15,6 +15,7 @@
#include <linux/errno.h>
#include <linux/kmod.h>
#include <linux/spinlock.h>
+#include <asm/byteorder.h>

static struct nls_table default_table;
static struct nls_table *tables = &default_table;
@@ -43,10 +44,17 @@ static const struct utf8_table utf8_tabl
{0, /* end of table */}
};

-int
-utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
+#define UNICODE_MAX 0x0010ffff
+#define PLANE_SIZE 0x00010000
+
+#define SURROGATE_MASK 0xfffff800
+#define SURROGATE_PAIR 0x0000d800
+#define SURROGATE_LOW 0x00000400
+#define SURROGATE_BITS 0x000003ff
+
+int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
{
- long l;
+ unsigned long l;
int c0, c, nc;
const struct utf8_table *t;

@@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, i
nc++;
if ((c0 & t->cmask) == t->cval) {
l &= t->lmask;
- if (l < t->lval)
+ if (l < t->lval || l > UNICODE_MAX ||
+ (l & SURROGATE_MASK) == SURROGATE_PAIR)
return -1;
- *p = l;
+ *pu = (unicode_t) l;
return nc;
}
- if (n <= nc)
+ if (len <= nc)
return -1;
s++;
c = (*s ^ 0x80) & 0xFF;
@@ -72,76 +81,119 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, i
}
return -1;
}
+EXPORT_SYMBOL(utf8_to_utf32);

-int
-utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n)
+int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
{
- __u16 *op;
- const __u8 *ip;
- int size;
-
- op = pwcs;
- ip = s;
- while (*ip && n > 0) {
- if (*ip & 0x80) {
- size = utf8_mbtowc(op, ip, n);
- if (size == -1) {
- /* Ignore character and move on */
- ip++;
- n--;
- } else {
- op++;
- ip += size;
- n -= size;
- }
- } else {
- *op++ = *ip++;
- n--;
- }
- }
- return (op - pwcs);
-}
-
-int
-utf8_wctomb(__u8 *s, wchar_t wc, int maxlen)
-{
- long l;
+ unsigned long l;
int c, nc;
const struct utf8_table *t;
-
+
if (!s)
return 0;
-
- l = wc;
+
+ l = u;
+ if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
+ return -1;
+
nc = 0;
for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
nc++;
if (l <= t->lmask) {
c = t->shift;
- *s = t->cval | (l >> c);
+ *s = (u8) (t->cval | (l >> c));
while (c > 0) {
c -= 6;
s++;
- *s = 0x80 | ((l >> c) & 0x3F);
+ *s = (u8) (0x80 | ((l >> c) & 0x3F));
}
return nc;
}
}
return -1;
}
+EXPORT_SYMBOL(utf32_to_utf8);

-int
-utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
+int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
{
- const __u16 *ip;
- __u8 *op;
+ u16 *op;
int size;
+ unicode_t u;
+
+ op = pwcs;
+ while (*s && len > 0) {
+ if (*s & 0x80) {
+ size = utf8_to_utf32(s, len, &u);
+ if (size < 0) {
+ /* Ignore character and move on */
+ size = 1;
+ } else if (u >= PLANE_SIZE) {
+ u -= PLANE_SIZE;
+ *op++ = (wchar_t) (SURROGATE_PAIR |
+ ((u >> 10) & SURROGATE_BITS));
+ *op++ = (wchar_t) (SURROGATE_PAIR |
+ SURROGATE_LOW |
+ (u & SURROGATE_BITS));
+ } else {
+ *op++ = (wchar_t) u;
+ }
+ s += size;
+ len -= size;
+ } else {
+ *op++ = *s++;
+ len--;
+ }
+ }
+ return op - pwcs;
+}
+EXPORT_SYMBOL(utf8s_to_utf16s);
+
+static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
+{
+ switch (endian) {
+ default:
+ return c;
+ case UTF16_LITTLE_ENDIAN:
+ return __le16_to_cpu(c);
+ case UTF16_BIG_ENDIAN:
+ return __be16_to_cpu(c);
+ }
+}
+
+int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
+ u8 *s, int maxlen)
+{
+ u8 *op;
+ int size;
+ unsigned long u, v;

op = s;
- ip = pwcs;
- while (*ip && maxlen > 0) {
- if (*ip > 0x7f) {
- size = utf8_wctomb(op, *ip, maxlen);
+ while (len > 0 && maxlen > 0) {
+ u = get_utf16(*pwcs, endian);
+ if (!u)
+ break;
+ pwcs++;
+ len--;
+ if (u > 0x7f) {
+ if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
+ if (u & SURROGATE_LOW) {
+ /* Ignore character and move on */
+ continue;
+ }
+ if (len <= 0)
+ break;
+ v = get_utf16(*pwcs, endian);
+ if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
+ !(v & SURROGATE_LOW)) {
+ /* Ignore character and move on */
+ continue;
+ }
+ u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
+ + (v & SURROGATE_BITS);
+ pwcs++;
+ len--;
+ }
+ size = utf32_to_utf8(u, op, maxlen);
if (size == -1) {
/* Ignore character and move on */
} else {
@@ -149,13 +201,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pw
maxlen -= size;
}
} else {
- *op++ = (__u8) *ip;
+ *op++ = (u8) u;
maxlen--;
}
- ip++;
}
- return (op - s);
+ return op - s;
}
+EXPORT_SYMBOL(utf16s_to_utf8s);

int register_nls(struct nls_table * nls)
{
@@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls);
EXPORT_SYMBOL(unload_nls);
EXPORT_SYMBOL(load_nls);
EXPORT_SYMBOL(load_nls_default);
-EXPORT_SYMBOL(utf8_mbtowc);
-EXPORT_SYMBOL(utf8_mbstowcs);
-EXPORT_SYMBOL(utf8_wctomb);
-EXPORT_SYMBOL(utf8_wcstombs);

MODULE_LICENSE("Dual BSD/GPL");
Index: usb-2.6/fs/nls/nls_utf8.c
===================================================================
--- usb-2.6.orig/fs/nls/nls_utf8.c
+++ usb-2.6/fs/nls/nls_utf8.c
@@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigne
{
int n;

- if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) {
+ if (boundlen <= 0)
+ return -ENAMETOOLONG;
+
+ n = utf32_to_utf8(uni, out, boundlen);
+ if (n < 0) {
*out = '?';
return -EINVAL;
}
@@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigne
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
{
int n;
+ unicode_t u;

- if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) {
+ n = utf8_to_utf32(rawstring, boundlen, &u);
+ if (n < 0 || u > MAX_WCHAR_T) {
*uni = 0x003f; /* ? */
- n = -EINVAL;
+ return -EINVAL;
}
+ *uni = (wchar_t) u;
return n;
}

Index: usb-2.6/fs/befs/linuxvfs.c
===================================================================
--- usb-2.6.orig/fs/befs/linuxvfs.c
+++ usb-2.6/fs/befs/linuxvfs.c
@@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, con
{
struct nls_table *nls = BEFS_SB(sb)->nls;
int i, o;
- wchar_t uni;
+ unicode_t uni;
int unilen, utflen;
char *result;
/* The utf8->nls conversion won't make the final nls string bigger
@@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, con
for (i = o = 0; i < in_len; i += utflen, o += unilen) {

/* convert from UTF-8 to Unicode */
- utflen = utf8_mbtowc(&uni, &in[i], in_len - i);
- if (utflen < 0) {
+ utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
+ if (utflen < 0)
goto conv_err;
- }

/* convert from Unicode to nls */
+ if (uni > MAX_WCHAR_T)
+ goto conv_err;
unilen = nls->uni2char(uni, &result[o], in_len - o);
- if (unilen < 0) {
+ if (unilen < 0)
goto conv_err;
- }
}
result[o] = '\0';
*out_len = o;
@@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, con

/* convert from nls to unicode */
unilen = nls->char2uni(&in[i], in_len - i, &uni);
- if (unilen < 0) {
+ if (unilen < 0)
goto conv_err;
- }

/* convert from unicode to UTF-8 */
- utflen = utf8_wctomb(&result[o], uni, 3);
- if (utflen <= 0) {
+ utflen = utf32_to_utf8(uni, &result[o], 3);
+ if (utflen <= 0)
goto conv_err;
- }
}

result[o] = '\0';
Index: usb-2.6/fs/fat/dir.c
===================================================================
--- usb-2.6.orig/fs/fat/dir.c
+++ usb-2.6/fs/fat/dir.c
@@ -22,6 +22,19 @@
#include <asm/uaccess.h>
#include "fat.h"

+/*
+ * Maximum buffer size of short name.
+ * [(MSDOS_NAME + '.') * max one char + nul]
+ * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
+ */
+#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
+/*
+ * Maximum buffer size of unicode chars from slots.
+ * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
+ */
+#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1)
+#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
+
static inline loff_t fat_make_i_pos(struct super_block *sb,
struct buffer_head *bh,
struct msdos_dir_entry *de)
@@ -171,7 +184,8 @@ static inline int fat_uni_to_x8(struct m
unsigned char *buf, int size)
{
if (sbi->options.utf8)
- return utf8_wcstombs(buf, uni, size);
+ return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
+ UTF16_HOST_ENDIAN, buf, size);
else
return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
sbi->nls_io);
@@ -325,19 +339,6 @@ parse_long:
}

/*
- * Maximum buffer size of short name.
- * [(MSDOS_NAME + '.') * max one char + nul]
- * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
- */
-#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
-/*
- * Maximum buffer size of unicode chars from slots.
- * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
- */
-#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1)
-#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
-
-/*
* Return values: negative -> error, 0 -> not found, positive -> found,
* value is the total amount of slots, including the shortname entry.
*/
Index: usb-2.6/fs/fat/namei_vfat.c
===================================================================
--- usb-2.6.orig/fs/fat/namei_vfat.c
+++ usb-2.6/fs/fat/namei_vfat.c
@@ -502,11 +502,11 @@ xlate_to_uni(const unsigned char *name,
if (utf8) {
int name_len = strlen(name);

- *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
+ *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);

/*
* We stripped '.'s before and set len appropriately,
- * but utf8_mbstowcs doesn't care about len
+ * but utf8s_to_utf16s doesn't care about len
*/
*outlen -= (name_len - len);

Index: usb-2.6/fs/isofs/joliet.c
===================================================================
--- usb-2.6.orig/fs/isofs/joliet.c
+++ usb-2.6/fs/isofs/joliet.c
@@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16
return (op - ascii);
}

-/* Convert big endian wide character string to utf8 */
-static int
-wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen)
-{
- const __u8 *ip;
- __u8 *op;
- int size;
- __u16 c;
-
- op = s;
- ip = pwcs;
- while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) {
- c = (*ip << 8) | ip[1];
- if (c > 0x7f) {
- size = utf8_wctomb(op, c, maxlen);
- if (size == -1) {
- /* Ignore character and move on */
- maxlen--;
- } else {
- op += size;
- maxlen -= size;
- }
- } else {
- *op++ = (__u8) c;
- }
- ip += 2;
- inlen--;
- }
- return (op - s);
-}
-
int
get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
{
@@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory
nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;

if (utf8) {
- len = wcsntombs_be(outname, de->name,
- de->name_len[0] >> 1, PAGE_SIZE);
+ len = utf16s_to_utf8s((const wchar_t *) de->name,
+ de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
+ outname, PAGE_SIZE);
} else {
len = uni16_to_x8(outname, (__be16 *) de->name,
de->name_len[0] >> 1, nls);
Index: usb-2.6/fs/ncpfs/ncplib_kernel.c
===================================================================
--- usb-2.6.orig/fs/ncpfs/ncplib_kernel.c
+++ usb-2.6/fs/ncpfs/ncplib_kernel.c
@@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, u

if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
int k;
+ unicode_t u;

- k = utf8_mbtowc(&ec, iname, iname_end - iname);
- if (k < 0)
+ k = utf8_to_utf32(iname, iname_end - iname, &u);
+ if (k < 0 || u > MAX_WCHAR_T)
return -EINVAL;
iname += k;
+ ec = u;
} else {
if (*iname == NCP_ESC) {
int k;
@@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, u
if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
int k;

- k = utf8_wctomb(iname, ec, iname_end - iname);
+ k = utf32_to_utf8(ec, iname, iname_end - iname);
if (k < 0) {
err = -ENAMETOOLONG;
goto quit;
Index: usb-2.6/drivers/usb/core/message.c
===================================================================
--- usb-2.6.orig/drivers/usb/core/message.c
+++ usb-2.6/drivers/usb/core/message.c
@@ -780,14 +780,13 @@ int usb_string(struct usb_device *dev, i
{
unsigned char *tbuf;
int err;
- unsigned int u;

if (dev->state == USB_STATE_SUSPENDED)
return -EHOSTUNREACH;
if (size <= 0 || !buf || !index)
return -EINVAL;
buf[0] = 0;
- tbuf = kmalloc(256 + 2, GFP_NOIO);
+ tbuf = kmalloc(256, GFP_NOIO);
if (!tbuf)
return -ENOMEM;

@@ -814,12 +813,9 @@ int usb_string(struct usb_device *dev, i
if (err < 0)
goto errout;

- for (u = 2; u < err; u += 2)
- le16_to_cpus((u16 *)&tbuf[u]);
- tbuf[u] = 0;
- tbuf[u + 1] = 0;
size--; /* leave room for trailing NULL char in output buffer */
- err = utf8_wcstombs(buf, (u16 *)&tbuf[2], size);
+ err = utf16s_to_utf8s((wchar_t *) &tbuf[2], (err - 2) / 2,
+ UTF16_LITTLE_ENDIAN, buf, size);
buf[err] = 0;

if (tbuf[1] != USB_DT_STRING)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/