Re: [PATCH 3/3] fs: unicode: Add utf8 module and a unicode layer

From: Shreeya Patel
Date: Sun Mar 14 2021 - 10:47:34 EST



On 14/03/21 7:19 am, Gabriel Krisman Bertazi wrote:
Shreeya Patel <shreeya.patel@xxxxxxxxxxxxx> writes:

utf8data.h_shipped has a large database table which is an auto-generated
decodification trie for the unicode normalization functions.
It is not necessary to carry this large table in the kernel hence make
UTF-8 encoding loadable by converting it into a module.
Also, modify the file called unicode-core which will act as a layer for
unicode subsystem. It will load the UTF-8 module and access it's functions
whenever any filesystem that needs unicode is mounted.

Signed-off-by: Shreeya Patel <shreeya.patel@xxxxxxxxxxxxx>
Hi Shreeya,
Hi Gabriel,

---
fs/unicode/Kconfig | 7 +-
fs/unicode/Makefile | 5 +-
fs/unicode/unicode-core.c | 201 ++++++-------------------------
fs/unicode/utf8-core.c | 112 +++++++++++++++++
fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++
include/linux/unicode.h | 20 ++++
6 files changed, 427 insertions(+), 164 deletions(-)
create mode 100644 fs/unicode/utf8-core.c
create mode 100644 fs/unicode/utf8mod.c

diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index 2c27b9a5cd6c..33a27deef729 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -8,7 +8,12 @@ config UNICODE
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
support.
+config UNICODE_UTF8
+ tristate "UTF-8 module"
+ depends on UNICODE
+ default m
+
config UNICODE_NORMALIZATION_SELFTEST
tristate "Test UTF-8 normalization support"
- depends on UNICODE
+ depends on UNICODE_UTF8
default n
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index fbf9a629ed0d..9dbb04194b32 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,11 +1,14 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_UNICODE) += unicode.o
+obj-$(CONFIG_UNICODE_UTF8) += utf8.o
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
-unicode-y := utf8-norm.o unicode-core.o
+unicode-y := unicode-core.o
+utf8-y := utf8mod.o utf8-norm.o
$(obj)/utf8-norm.o: $(obj)/utf8data.h
+$(obj)/utf8mod.o: $(obj)/utf8-norm.o
# In the normal build, the checked-in utf8data.h is just shipped.
#
diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
index d5f09e022ac5..b832341f1e7b 100644
--- a/fs/unicode/unicode-core.c
+++ b/fs/unicode/unicode-core.c
@@ -7,70 +7,29 @@
#include <linux/errno.h>
#include <linux/unicode.h>
#include <linux/stringhash.h>
+#include <linux/delay.h>
-#include "utf8n.h"
+struct unicode_ops *utf8_ops;
+
+static int unicode_load_module(void);
This is unnecessary
int unicode_validate(const struct unicode_map *um, const struct qstr *str)
{
- const struct utf8data *data = utf8nfdi(um->version);
-
- if (utf8nlen(data, str->name, str->len) < 0)
- return -1;
- return 0;
+ return utf8_ops->validate(um, str);
}
EXPORT_SYMBOL(unicode_validate);
int unicode_strncmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
- const struct utf8data *data = utf8nfdi(um->version);
- struct utf8cursor cur1, cur2;
- int c1, c2;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = utf8byte(&cur2);
-
- if (c1 < 0 || c2 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
+ return utf8_ops->strncmp(um, s1, s2);
}
I think these would go on a header file and inlined.

EXPORT_SYMBOL(unicode_strncmp);
int unicode_strncasecmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur1, cur2;
- int c1, c2;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = utf8byte(&cur2);
-
- if (c1 < 0 || c2 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
+ return utf8_ops->strncasecmp(um, s1, s2);
}
EXPORT_SYMBOL(unicode_strncasecmp);
@@ -81,155 +40,73 @@ int unicode_strncasecmp_folded(const struct unicode_map *um,
const struct qstr *cf,
const struct qstr *s1)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur1;
- int c1, c2;
- int i = 0;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = cf->name[i++];
- if (c1 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
+ return utf8_ops->strncasecmp_folded(um, cf, s1);
}
EXPORT_SYMBOL(unicode_strncasecmp_folded);
int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur;
- size_t nlen = 0;
-
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
-
- for (nlen = 0; nlen < dlen; nlen++) {
- int c = utf8byte(&cur);
-
- dest[nlen] = c;
- if (!c)
- return nlen;
- if (c == -1)
- break;
- }
- return -EINVAL;
+ return utf8_ops->casefold(um, str, dest, dlen);
}
EXPORT_SYMBOL(unicode_casefold);
int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
struct qstr *str)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur;
- int c;
- unsigned long hash = init_name_hash(salt);
-
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
-
- while ((c = utf8byte(&cur))) {
- if (c < 0)
- return -EINVAL;
- hash = partial_name_hash((unsigned char)c, hash);
- }
- str->hash = end_name_hash(hash);
- return 0;
+ return utf8_ops->casefold_hash(um, salt, str);
}
EXPORT_SYMBOL(unicode_casefold_hash);
int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
- const struct utf8data *data = utf8nfdi(um->version);
- struct utf8cursor cur;
- ssize_t nlen = 0;
+ return utf8_ops->normalize(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_normalize);
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
+struct unicode_map *unicode_load(const char *version)
+{
+ int ret = unicode_load_module();
- for (nlen = 0; nlen < dlen; nlen++) {
- int c = utf8byte(&cur);
+ if (ret)
+ return ERR_PTR(ret);
- dest[nlen] = c;
- if (!c)
- return nlen;
- if (c == -1)
- break;
- }
- return -EINVAL;
+ else
+ return utf8_ops->load(version);
}
-EXPORT_SYMBOL(unicode_normalize);
+EXPORT_SYMBOL(unicode_load);
-static int unicode_parse_version(const char *version, unsigned int *maj,
- unsigned int *min, unsigned int *rev)
+void unicode_unload(struct unicode_map *um)
{
- substring_t args[3];
- char version_string[12];
- static const struct match_token token[] = {
- {1, "%d.%d.%d"},
- {0, NULL}
- };
+ kfree(um);
+}
+EXPORT_SYMBOL(unicode_unload);
- strncpy(version_string, version, sizeof(version_string));
+static int unicode_load_module(void)
+{
+ int ret = request_module("utf8");
- if (match_token(version_string, token, args) != 1)
- return -EINVAL;
+ msleep(100);
I think I misunderstood when you mentioned you did this msleep. It was
ok to debug the issue you were observing, but it is not a solution.
Setting an arbitrary amount of time will either waste time, or you can
still fail if things take longer than expected. There are mechanisms to
load and wait on a module. See how fs/nls/nls_base.c do exactly this.

- if (match_int(&args[0], maj) || match_int(&args[1], min) ||
- match_int(&args[2], rev))
- return -EINVAL;
+ if (ret) {
+ pr_err("Failed to load UTF-8 module\n");
+ return ret;
+ }
return 0;
}
-struct unicode_map *unicode_load(const char *version)
+void unicode_register(struct unicode_ops *ops)
{
- struct unicode_map *um = NULL;
- int unicode_version;
-
- if (version) {
- unsigned int maj, min, rev;
-
- if (unicode_parse_version(version, &maj, &min, &rev) < 0)
- return ERR_PTR(-EINVAL);
-
- if (!utf8version_is_supported(maj, min, rev))
- return ERR_PTR(-EINVAL);
-
- unicode_version = UNICODE_AGE(maj, min, rev);
- } else {
- unicode_version = utf8version_latest();
- printk(KERN_WARNING"UTF-8 version not specified. "
- "Assuming latest supported version (%d.%d.%d).",
- (unicode_version >> 16) & 0xff,
- (unicode_version >> 8) & 0xff,
- (unicode_version & 0xff));
- }
-
- um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
- if (!um)
- return ERR_PTR(-ENOMEM);
-
- um->charset = "UTF-8";
- um->version = unicode_version;
-
- return um;
+ utf8_ops = ops;
}
-EXPORT_SYMBOL(unicode_load);
+EXPORT_SYMBOL(unicode_register);
-void unicode_unload(struct unicode_map *um)
+void unicode_unregister(void)
{
- kfree(um);
+ utf8_ops = NULL;
}
-EXPORT_SYMBOL(unicode_unload);
+EXPORT_SYMBOL(unicode_unregister);
MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
new file mode 100644
index 000000000000..009faa68330c
--- /dev/null
+++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+#include <linux/stringhash.h>
+#include <linux/delay.h>
+
+struct unicode_ops *utf8_ops;
+
+static int unicode_load_module(void);
+
+int unicode_validate(const struct unicode_map *um, const struct qstr *str)
+{
+ return utf8_ops->validate(um, str);
+}
+EXPORT_SYMBOL(unicode_validate);
+
+int unicode_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ return utf8_ops->strncmp(um, s1, s2);
+}
+EXPORT_SYMBOL(unicode_strncmp);
I'm confused now. Isn't this redefining unicode_strncmp ? It was
defined in unicode_core.c on the hunk above and now it is redefined on
utf8_core.c. There is something odd here.
sorry, I think I messed up patches while using git send-email and that is why you might see
two copies of the last patch. Let me resend the series and then it might make sense. One question
though, why would unicode_strncmp go into the header file?

+
+int unicode_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ return utf8_ops->strncasecmp(um, s1, s2);
+}
+EXPORT_SYMBOL(unicode_strncasecmp);
+
+/* String cf is expected to be a valid UTF-8 casefolded
+ * string.
+ */
+int unicode_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
+{
+ return utf8_ops->strncasecmp_folded(um, cf, s1);
+}
+EXPORT_SYMBOL(unicode_strncasecmp_folded);
+
+int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ return utf8_ops->casefold(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_casefold);
+
+int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
+{
+ return utf8_ops->casefold_hash(um, salt, str);
+}
+EXPORT_SYMBOL(unicode_casefold_hash);
+
+int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ return utf8_ops->normalize(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_normalize);
+
+struct unicode_map *unicode_load(const char *version)
+{
+ int ret = unicode_load_module();
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ else
+ return utf8_ops->load(version);
+}
+EXPORT_SYMBOL(unicode_load);
+
+void unicode_unload(struct unicode_map *um)
+{
+ kfree(um);
+}
+EXPORT_SYMBOL(unicode_unload);
+
+void unicode_register(struct unicode_ops *ops)
+{
+ utf8_ops = ops;
+}
+EXPORT_SYMBOL(unicode_register);
+
+void unicode_unregister(void)
+{
+ utf8_ops = NULL;
+}
+EXPORT_SYMBOL(unicode_unregister);
+
+static int unicode_load_module(void)
+{
+ int ret = request_module("utf8");
+
+ msleep(100);
+
+ if (ret) {
+ pr_err("Failed to load UTF-8 module\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
new file mode 100644
index 000000000000..8eaeeb27255c
--- /dev/null
+++ b/fs/unicode/utf8mod.c
@@ -0,0 +1,246 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+#include <linux/stringhash.h>
+
+#include "utf8n.h"
+
+static int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+
+ if (utf8nlen(data, str->name, str->len) < 0)
+ return -1;
+ return 0;
+}
+
+static int utf8_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+ struct utf8cursor cur1, cur2;
+ int c1, c2;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = utf8byte(&cur2);
+
+ if (c1 < 0 || c2 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+static int utf8_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur1, cur2;
+ int c1, c2;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = utf8byte(&cur2);
+
+ if (c1 < 0 || c2 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+/* String cf is expected to be a valid UTF-8 casefolded
+ * string.
+ */
+static int utf8_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur1;
+ int c1, c2;
+ int i = 0;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = cf->name[i++];
+ if (c1 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+static int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur;
+ size_t nlen = 0;
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ for (nlen = 0; nlen < dlen; nlen++) {
+ int c = utf8byte(&cur);
+
+ dest[nlen] = c;
+ if (!c)
+ return nlen;
+ if (c == -1)
+ break;
+ }
+ return -EINVAL;
+}
+
+static int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur;
+ int c;
+ unsigned long hash = init_name_hash(salt);
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ while ((c = utf8byte(&cur))) {
+ if (c < 0)
+ return -EINVAL;
+ hash = partial_name_hash((unsigned char)c, hash);
+ }
+ str->hash = end_name_hash(hash);
+ return 0;
+}
+
+static int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+ struct utf8cursor cur;
+ ssize_t nlen = 0;
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ for (nlen = 0; nlen < dlen; nlen++) {
+ int c = utf8byte(&cur);
+
+ dest[nlen] = c;
+ if (!c)
+ return nlen;
+ if (c == -1)
+ break;
+ }
+ return -EINVAL;
+}
+
+static int utf8_parse_version(const char *version, unsigned int *maj,
+ unsigned int *min, unsigned int *rev)
+{
+ substring_t args[3];
+ char version_string[12];
+ static const struct match_token token[] = {
+ {1, "%d.%d.%d"},
+ {0, NULL}
+ };
+
+ strncpy(version_string, version, sizeof(version_string));
+
+ if (match_token(version_string, token, args) != 1)
+ return -EINVAL;
+
+ if (match_int(&args[0], maj) || match_int(&args[1], min) ||
+ match_int(&args[2], rev))
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct unicode_map *utf8_load(const char *version)
+{
+ struct unicode_map *um = NULL;
+ int unicode_version;
+
+ if (version) {
+ unsigned int maj, min, rev;
+
+ if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+ return ERR_PTR(-EINVAL);
+
+ if (!utf8version_is_supported(maj, min, rev))
+ return ERR_PTR(-EINVAL);
+
+ unicode_version = UNICODE_AGE(maj, min, rev);
+ } else {
+ unicode_version = utf8version_latest();
+ printk(KERN_WARNING"UTF-8 version not specified. "
+ "Assuming latest supported version (%d.%d.%d).",
+ (unicode_version >> 16) & 0xff,
+ (unicode_version >> 8) & 0xff,
+ (unicode_version & 0xff));
+ }
+
+ um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
+ if (!um)
+ return ERR_PTR(-ENOMEM);
+
+ um->charset = "UTF-8";
+ um->version = unicode_version;
+
+ return um;
+}
+
+static struct unicode_ops ops = {
+ .validate = utf8_validate,
+ .strncmp = utf8_strncmp,
+ .strncasecmp = utf8_strncasecmp,
+ .strncasecmp_folded = utf8_strncasecmp_folded,
+ .casefold = utf8_casefold,
+ .casefold_hash = utf8_casefold_hash,
+ .normalize = utf8_normalize,
+ .load = utf8_load,
+};
+
+static int __init utf8_init(void)
+{
+ unicode_register(&ops);
+ return 0;
+}
+
+static void __exit utf8_exit(void)
+{
+ unicode_unregister();
+}
+
+module_init(utf8_init);
+module_exit(utf8_exit);
+
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index de23f9ee720b..b0d59069e438 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -10,6 +10,23 @@ struct unicode_map {
int version;
};
+struct unicode_ops {
+ int (*validate)(const struct unicode_map *um, const struct qstr *str);
+ int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
+ const struct qstr *s2);
+ int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
+ const struct qstr *s2);
+ int (*strncasecmp_folded)(const struct unicode_map *um, const struct qstr *cf,
+ const struct qstr *s1);
+ int (*normalize)(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);
+ int (*casefold)(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);
+ int (*casefold_hash)(const struct unicode_map *um, const void *salt,
+ struct qstr *str);
+ struct unicode_map* (*load)(const char *version);
+};
Also, make sure you run checkpatch.pl on the patch series before
submitting.
I ran checkpatch.pl over the patch, but it seems there were some previously existing warnings
which are not introduced due to any change made in this patch series.
I am not sure if I am supposed to resolve those warnings in this patch series.

+
int unicode_validate(const struct unicode_map *um, const struct qstr *str);
int unicode_strncmp(const struct unicode_map *um,
@@ -33,4 +50,7 @@ int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
struct unicode_map *unicode_load(const char *version);
void unicode_unload(struct unicode_map *um);
+void unicode_register(struct unicode_ops *ops);
+void unicode_unregister(void);
+
#endif /* _LINUX_UNICODE_H */