Re: [PATCH v2 04/10] fs/ntfs3: Add file operations and implementation

From: Pali Rohár
Date: Sun Aug 23 2020 - 05:49:02 EST


Hello Konstantin!

On Friday 21 August 2020 16:25:15 Konstantin Komarov wrote:
> diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
> new file mode 100644
> index 000000000000..5f1105f1283c
> --- /dev/null
> +++ b/fs/ntfs3/dir.c
> @@ -0,0 +1,529 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * linux/fs/ntfs3/dir.c
> + *
> + * Copyright (C) 2019-2020 Paragon Software GmbH, All rights reserved.
> + *
> + * directory handling functions for ntfs-based filesystems
> + *
> + */
> +#include <linux/blkdev.h>
> +#include <linux/buffer_head.h>
> +#include <linux/fs.h>
> +#include <linux/iversion.h>
> +#include <linux/nls.h>
> +
> +#include "debug.h"
> +#include "ntfs.h"
> +#include "ntfs_fs.h"
> +
> +/*
> + * Convert little endian Unicode 16 to UTF-8.

I guess that by "Unicode 16" you mean UTF-16, right?

Anyway, the comment is incorrect, as the function supports neither UTF-16
nor UTF-8. This function works only with UCS-2 encoding (not full UTF-16)
and converts the input buffer to NLS encoding, not UTF-8. Moreover, the
kernel's NLS API does not support full UTF-8, and the NLS UTF-8 encoding is
semi-broken and limited to just 3-byte sequences, which means it does not
allow access to all Unicode filenames.

So the result is that the comment for the uni_to_x8 function is incorrect.

I would suggest not using the NLS API for encoding from/to UTF-8, but
rather using the utf16s_to_utf8s() and utf8s_to_utf16s() functions.

See for example how it is implemented in exfat driver:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/exfat/nls.c
Look for functions exfat_utf16_to_nls() and exfat_nls_to_utf16().

Ideally, check whether you can store the character 💩 (Pile of Poo, U+1F4A9,
which does not fit into a 3-byte UTF-8 sequence) in a filename, and whether
there is correct interoperability between Windows and this new ntfs3
implementation.

> + */
> +int uni_to_x8(ntfs_sb_info *sbi, const struct le_str *uni, u8 *buf, int buf_len)
> +{
> +	/*
> +	 * Convert the little-endian 16-bit units of @uni into bytes using the
> +	 * uni2char() method of the NLS table in sbi->nls, writing them to @buf.
> +	 *
> +	 * A unit that uni2char() cannot convert is emitted as ':' followed by
> +	 * two hex_byte_pack() results (high byte, then low byte). Conversion
> +	 * stops with a warning once fewer than NLS_MAX_CHARSET_SIZE bytes are
> +	 * left in @buf. The output is NUL-terminated; the return value is the
> +	 * number of bytes written, not counting the terminator.
> +	 */
> +	struct nls_table *nls = sbi->nls;
> +	const __le16 *src = uni->name;
> +	u8 *dst = buf;
> +	int left;
> +
> +	static_assert(sizeof(wchar_t) == sizeof(__le16));
> +
> +	for (left = uni->len; left != 0; left--) {
> +		u16 ch;
> +		int n;
> +
> +		/* Require room for the largest possible sequence up front. */
> +		if (buf_len < NLS_MAX_CHARSET_SIZE) {
> +			ntfs_warning(
> +				sbi->sb,
> +				"filename was truncated while converting.");
> +			break;
> +		}
> +
> +		ch = le16_to_cpu(*src);
> +		src += 1;
> +
> +		n = nls->uni2char(ch, dst, buf_len);
> +		if (n <= 0) {
> +			/* Not representable: write the ':' escape (5 bytes). */
> +			dst[0] = ':';
> +			dst = hex_byte_pack(dst + 1, ch >> 8);
> +			dst = hex_byte_pack(dst, ch);
> +			buf_len -= 5;
> +			continue;
> +		}
> +
> +		dst += n;
> +		buf_len -= n;
> +	}
> +
> +	*dst = 0;
> +	return dst - buf;
> +}
> +
> +static inline u8 get_digit(u8 d)
> +{
> +	/* Map the low nibble of @d to its uppercase hexadecimal digit. */
> +	static const char hex_upper[] = "0123456789ABCDEF";
> +
> +	return hex_upper[d & 0xf];
> +}
> +
> +/*
> + * Convert the byte string @name (@name_len bytes) to 16-bit units using the
> + * char2uni() method of the NLS table in sbi->nls, storing them in uni->name.
> + *
> + * A byte that char2uni() cannot convert is stored as three units: '%'
> + * followed by the two uppercase hex digits of that byte. A warning is
> + * logged for the first such failure only.
> + *
> + * max_ulen - maximum number of 16-bit units that may be stored in @uni
> + * endian   - requested byte order of the stored units; when it differs from
> + *            the CPU byte order the converted units are swapped in place
> + *
> + * Returns the number of units stored (also written to uni->len), or
> + * -ENAMETOOLONG if the result does not fit in @max_ulen units.
> + */
> +int x8_to_uni(ntfs_sb_info *sbi, const u8 *name, u32 name_len,
> +	      struct cpu_str *uni, u32 max_ulen, enum utf16_endian endian)
> +{
> +	int i, ret, clen;
> +	u32 tail;
> +	const u8 *str = name;
> +	const u8 *end = name + name_len;
> +	u16 *uname = uni->name;
> +	struct nls_table *nls = sbi->nls;
> +	int warn = 0;
> +
> +	static_assert(sizeof(wchar_t) == sizeof(u16));
> +
> +	for (ret = 0; str < end; ret += 1, uname += 1, str += clen) {
> +		if (ret >= max_ulen)
> +			return -ENAMETOOLONG;
> +		tail = end - str;
> +
> +		clen = nls->char2uni(str, tail, uname);
> +		if (clen > 0)
> +			continue;
> +
> +		if (!warn) {
> +			warn = 1;
> +			/* "%.*s" takes its precision as an int. */
> +			ntfs_warning(
> +				sbi->sb,
> +				"%s -> unicode failed: '%.*s', pos %d, chars %x %x %x",
> +				nls->charset, (int)name_len, name,
> +				(int)(str - name), str[0],
> +				tail > 1 ? str[1] : 0, tail > 2 ? str[2] : 0);
> +		}
> +
> +		/* The escape sequence needs three output units. */
> +		if (ret + 3 > max_ulen)
> +			return -ENAMETOOLONG;
> +
> +		uname[0] = '%';
> +		uname[1] = get_digit(*str >> 4);
> +		uname[2] = get_digit(*str >> 0);
> +
> +		uname += 2;
> +		ret += 2; // +1 will be added in for ( .... )
> +		clen = 1;
> +	}
> +
> +	/*
> +	 * Byte-swap the units that were just stored. The source of each swap
> +	 * must be the converted unit itself, not the input byte string.
> +	 */
> +#ifdef __BIG_ENDIAN
> +	if (endian == UTF16_LITTLE_ENDIAN) {
> +		__le16 *out = (__le16 *)uni->name;
> +
> +		for (i = 0; i < ret; i++)
> +			out[i] = cpu_to_le16(uni->name[i]);
> +	}
> +#else
> +	if (endian == UTF16_BIG_ENDIAN) {
> +		__be16 *out = (__be16 *)uni->name;
> +
> +		for (i = 0; i < ret; i++)
> +			out[i] = cpu_to_be16(uni->name[i]);
> +	}
> +#endif
> +
> +	uni->len = ret;
> +	return ret;
> +}