[PATCH 3/3] fs: unicode: Add utf8 module and a unicode layer

From: Shreeya Patel
Date: Sat Mar 13 2021 - 18:14:28 EST


utf8data.h_shipped has a large database table which is an auto-generated
decodification trie for the unicode normalization functions.
It is not necessary to carry this large table in the kernel hence make
UTF-8 encoding loadable by converting it into a module.
Also, modify the file called unicode-core which will act as a layer for
unicode subsystem. It will load the UTF-8 module and access it's functions
whenever any filesystem that needs unicode is mounted.

Signed-off-by: Shreeya Patel <shreeya.patel@xxxxxxxxxxxxx>
---
fs/unicode/Kconfig | 7 +-
fs/unicode/Makefile | 5 +-
fs/unicode/unicode-core.c | 201 ++++++-------------------------
fs/unicode/utf8-core.c | 112 +++++++++++++++++
fs/unicode/utf8mod.c | 246 ++++++++++++++++++++++++++++++++++++++
include/linux/unicode.h | 20 ++++
6 files changed, 427 insertions(+), 164 deletions(-)
create mode 100644 fs/unicode/utf8-core.c
create mode 100644 fs/unicode/utf8mod.c

diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index 2c27b9a5cd6c..33a27deef729 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -8,7 +8,12 @@ config UNICODE
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
support.

+config UNICODE_UTF8
+ tristate "UTF-8 module"
+ depends on UNICODE
+ default m
+
config UNICODE_NORMALIZATION_SELFTEST
tristate "Test UTF-8 normalization support"
- depends on UNICODE
+ depends on UNICODE_UTF8
default n
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index fbf9a629ed0d..9dbb04194b32 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,11 +1,14 @@
# SPDX-License-Identifier: GPL-2.0

obj-$(CONFIG_UNICODE) += unicode.o
+obj-$(CONFIG_UNICODE_UTF8) += utf8.o
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o

-unicode-y := utf8-norm.o unicode-core.o
+unicode-y := unicode-core.o
+utf8-y := utf8mod.o utf8-norm.o

$(obj)/utf8-norm.o: $(obj)/utf8data.h
+$(obj)/utf8mod.o: $(obj)/utf8-norm.o

# In the normal build, the checked-in utf8data.h is just shipped.
#
diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
index d5f09e022ac5..b832341f1e7b 100644
--- a/fs/unicode/unicode-core.c
+++ b/fs/unicode/unicode-core.c
@@ -7,70 +7,29 @@
#include <linux/errno.h>
#include <linux/unicode.h>
#include <linux/stringhash.h>
+#include <linux/delay.h>

-#include "utf8n.h"
+struct unicode_ops *utf8_ops;
+
+static int unicode_load_module(void);

int unicode_validate(const struct unicode_map *um, const struct qstr *str)
{
- const struct utf8data *data = utf8nfdi(um->version);
-
- if (utf8nlen(data, str->name, str->len) < 0)
- return -1;
- return 0;
+ return utf8_ops->validate(um, str);
}
EXPORT_SYMBOL(unicode_validate);

int unicode_strncmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
- const struct utf8data *data = utf8nfdi(um->version);
- struct utf8cursor cur1, cur2;
- int c1, c2;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = utf8byte(&cur2);
-
- if (c1 < 0 || c2 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
+ return utf8_ops->strncmp(um, s1, s2);
}
EXPORT_SYMBOL(unicode_strncmp);

int unicode_strncasecmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur1, cur2;
- int c1, c2;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = utf8byte(&cur2);
-
- if (c1 < 0 || c2 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
+ return utf8_ops->strncasecmp(um, s1, s2);
}
EXPORT_SYMBOL(unicode_strncasecmp);

@@ -81,155 +40,73 @@ int unicode_strncasecmp_folded(const struct unicode_map *um,
const struct qstr *cf,
const struct qstr *s1)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur1;
- int c1, c2;
- int i = 0;
-
- if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
- return -EINVAL;
-
- do {
- c1 = utf8byte(&cur1);
- c2 = cf->name[i++];
- if (c1 < 0)
- return -EINVAL;
- if (c1 != c2)
- return 1;
- } while (c1);
-
- return 0;
+ return utf8_ops->strncasecmp_folded(um, cf, s1);
}
EXPORT_SYMBOL(unicode_strncasecmp_folded);

int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur;
- size_t nlen = 0;
-
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
-
- for (nlen = 0; nlen < dlen; nlen++) {
- int c = utf8byte(&cur);
-
- dest[nlen] = c;
- if (!c)
- return nlen;
- if (c == -1)
- break;
- }
- return -EINVAL;
+ return utf8_ops->casefold(um, str, dest, dlen);
}
EXPORT_SYMBOL(unicode_casefold);

int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
struct qstr *str)
{
- const struct utf8data *data = utf8nfdicf(um->version);
- struct utf8cursor cur;
- int c;
- unsigned long hash = init_name_hash(salt);
-
- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
-
- while ((c = utf8byte(&cur))) {
- if (c < 0)
- return -EINVAL;
- hash = partial_name_hash((unsigned char)c, hash);
- }
- str->hash = end_name_hash(hash);
- return 0;
+ return utf8_ops->casefold_hash(um, salt, str);
}
EXPORT_SYMBOL(unicode_casefold_hash);

int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
- const struct utf8data *data = utf8nfdi(um->version);
- struct utf8cursor cur;
- ssize_t nlen = 0;
+ return utf8_ops->normalize(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_normalize);

- if (utf8ncursor(&cur, data, str->name, str->len) < 0)
- return -EINVAL;
+struct unicode_map *unicode_load(const char *version)
+{
+ int ret = unicode_load_module();

- for (nlen = 0; nlen < dlen; nlen++) {
- int c = utf8byte(&cur);
+ if (ret)
+ return ERR_PTR(ret);

- dest[nlen] = c;
- if (!c)
- return nlen;
- if (c == -1)
- break;
- }
- return -EINVAL;
+ else
+ return utf8_ops->load(version);
}
-EXPORT_SYMBOL(unicode_normalize);
+EXPORT_SYMBOL(unicode_load);

-static int unicode_parse_version(const char *version, unsigned int *maj,
- unsigned int *min, unsigned int *rev)
+void unicode_unload(struct unicode_map *um)
{
- substring_t args[3];
- char version_string[12];
- static const struct match_token token[] = {
- {1, "%d.%d.%d"},
- {0, NULL}
- };
+ kfree(um);
+}
+EXPORT_SYMBOL(unicode_unload);

- strncpy(version_string, version, sizeof(version_string));
+static int unicode_load_module(void)
+{
+ int ret = request_module("utf8");

- if (match_token(version_string, token, args) != 1)
- return -EINVAL;
+ msleep(100);

- if (match_int(&args[0], maj) || match_int(&args[1], min) ||
- match_int(&args[2], rev))
- return -EINVAL;
+ if (ret) {
+ pr_err("Failed to load UTF-8 module\n");
+ return ret;
+ }

return 0;
}

-struct unicode_map *unicode_load(const char *version)
+void unicode_register(struct unicode_ops *ops)
{
- struct unicode_map *um = NULL;
- int unicode_version;
-
- if (version) {
- unsigned int maj, min, rev;
-
- if (unicode_parse_version(version, &maj, &min, &rev) < 0)
- return ERR_PTR(-EINVAL);
-
- if (!utf8version_is_supported(maj, min, rev))
- return ERR_PTR(-EINVAL);
-
- unicode_version = UNICODE_AGE(maj, min, rev);
- } else {
- unicode_version = utf8version_latest();
- printk(KERN_WARNING"UTF-8 version not specified. "
- "Assuming latest supported version (%d.%d.%d).",
- (unicode_version >> 16) & 0xff,
- (unicode_version >> 8) & 0xff,
- (unicode_version & 0xff));
- }
-
- um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
- if (!um)
- return ERR_PTR(-ENOMEM);
-
- um->charset = "UTF-8";
- um->version = unicode_version;
-
- return um;
+ utf8_ops = ops;
}
-EXPORT_SYMBOL(unicode_load);
+EXPORT_SYMBOL(unicode_register);

-void unicode_unload(struct unicode_map *um)
+void unicode_unregister(void)
{
- kfree(um);
+ utf8_ops = NULL;
}
-EXPORT_SYMBOL(unicode_unload);
+EXPORT_SYMBOL(unicode_unregister);

MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
new file mode 100644
index 000000000000..009faa68330c
--- /dev/null
+++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+#include <linux/stringhash.h>
+#include <linux/delay.h>
+
+struct unicode_ops *utf8_ops;
+
+static int unicode_load_module(void);
+
+int unicode_validate(const struct unicode_map *um, const struct qstr *str)
+{
+ return utf8_ops->validate(um, str);
+}
+EXPORT_SYMBOL(unicode_validate);
+
+int unicode_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ return utf8_ops->strncmp(um, s1, s2);
+}
+EXPORT_SYMBOL(unicode_strncmp);
+
+int unicode_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ return utf8_ops->strncasecmp(um, s1, s2);
+}
+EXPORT_SYMBOL(unicode_strncasecmp);
+
+/* String cf is expected to be a valid UTF-8 casefolded
+ * string.
+ */
+int unicode_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
+{
+ return utf8_ops->strncasecmp_folded(um, cf, s1);
+}
+EXPORT_SYMBOL(unicode_strncasecmp_folded);
+
+int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ return utf8_ops->casefold(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_casefold);
+
+int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
+{
+ return utf8_ops->casefold_hash(um, salt, str);
+}
+EXPORT_SYMBOL(unicode_casefold_hash);
+
+int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ return utf8_ops->normalize(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_normalize);
+
+struct unicode_map *unicode_load(const char *version)
+{
+ int ret = unicode_load_module();
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ else
+ return utf8_ops->load(version);
+}
+EXPORT_SYMBOL(unicode_load);
+
+void unicode_unload(struct unicode_map *um)
+{
+ kfree(um);
+}
+EXPORT_SYMBOL(unicode_unload);
+
+void unicode_register(struct unicode_ops *ops)
+{
+ utf8_ops = ops;
+}
+EXPORT_SYMBOL(unicode_register);
+
+void unicode_unregister(void)
+{
+ utf8_ops = NULL;
+}
+EXPORT_SYMBOL(unicode_unregister);
+
+static int unicode_load_module(void)
+{
+ int ret = request_module("utf8");
+
+ msleep(100);
+
+ if (ret) {
+ pr_err("Failed to load UTF-8 module\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
new file mode 100644
index 000000000000..8eaeeb27255c
--- /dev/null
+++ b/fs/unicode/utf8mod.c
@@ -0,0 +1,246 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+#include <linux/stringhash.h>
+
+#include "utf8n.h"
+
+static int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+
+ if (utf8nlen(data, str->name, str->len) < 0)
+ return -1;
+ return 0;
+}
+
+static int utf8_strncmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+ struct utf8cursor cur1, cur2;
+ int c1, c2;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = utf8byte(&cur2);
+
+ if (c1 < 0 || c2 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+static int utf8_strncasecmp(const struct unicode_map *um,
+ const struct qstr *s1, const struct qstr *s2)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur1, cur2;
+ int c1, c2;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = utf8byte(&cur2);
+
+ if (c1 < 0 || c2 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+/* String cf is expected to be a valid UTF-8 casefolded
+ * string.
+ */
+static int utf8_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur1;
+ int c1, c2;
+ int i = 0;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = cf->name[i++];
+ if (c1 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+
+static int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur;
+ size_t nlen = 0;
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ for (nlen = 0; nlen < dlen; nlen++) {
+ int c = utf8byte(&cur);
+
+ dest[nlen] = c;
+ if (!c)
+ return nlen;
+ if (c == -1)
+ break;
+ }
+ return -EINVAL;
+}
+
+static int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
+ struct qstr *str)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur;
+ int c;
+ unsigned long hash = init_name_hash(salt);
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ while ((c = utf8byte(&cur))) {
+ if (c < 0)
+ return -EINVAL;
+ hash = partial_name_hash((unsigned char)c, hash);
+ }
+ str->hash = end_name_hash(hash);
+ return 0;
+}
+
+static int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen)
+{
+ const struct utf8data *data = utf8nfdi(um->version);
+ struct utf8cursor cur;
+ ssize_t nlen = 0;
+
+ if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+ return -EINVAL;
+
+ for (nlen = 0; nlen < dlen; nlen++) {
+ int c = utf8byte(&cur);
+
+ dest[nlen] = c;
+ if (!c)
+ return nlen;
+ if (c == -1)
+ break;
+ }
+ return -EINVAL;
+}
+
+static int utf8_parse_version(const char *version, unsigned int *maj,
+ unsigned int *min, unsigned int *rev)
+{
+ substring_t args[3];
+ char version_string[12];
+ static const struct match_token token[] = {
+ {1, "%d.%d.%d"},
+ {0, NULL}
+ };
+
+ strncpy(version_string, version, sizeof(version_string));
+
+ if (match_token(version_string, token, args) != 1)
+ return -EINVAL;
+
+ if (match_int(&args[0], maj) || match_int(&args[1], min) ||
+ match_int(&args[2], rev))
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct unicode_map *utf8_load(const char *version)
+{
+ struct unicode_map *um = NULL;
+ int unicode_version;
+
+ if (version) {
+ unsigned int maj, min, rev;
+
+ if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+ return ERR_PTR(-EINVAL);
+
+ if (!utf8version_is_supported(maj, min, rev))
+ return ERR_PTR(-EINVAL);
+
+ unicode_version = UNICODE_AGE(maj, min, rev);
+ } else {
+ unicode_version = utf8version_latest();
+ printk(KERN_WARNING"UTF-8 version not specified. "
+ "Assuming latest supported version (%d.%d.%d).",
+ (unicode_version >> 16) & 0xff,
+ (unicode_version >> 8) & 0xff,
+ (unicode_version & 0xff));
+ }
+
+ um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
+ if (!um)
+ return ERR_PTR(-ENOMEM);
+
+ um->charset = "UTF-8";
+ um->version = unicode_version;
+
+ return um;
+}
+
+static struct unicode_ops ops = {
+ .validate = utf8_validate,
+ .strncmp = utf8_strncmp,
+ .strncasecmp = utf8_strncasecmp,
+ .strncasecmp_folded = utf8_strncasecmp_folded,
+ .casefold = utf8_casefold,
+ .casefold_hash = utf8_casefold_hash,
+ .normalize = utf8_normalize,
+ .load = utf8_load,
+};
+
+static int __init utf8_init(void)
+{
+ unicode_register(&ops);
+ return 0;
+}
+
+static void __exit utf8_exit(void)
+{
+ unicode_unregister();
+}
+
+module_init(utf8_init);
+module_exit(utf8_exit);
+
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index de23f9ee720b..b0d59069e438 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -10,6 +10,23 @@ struct unicode_map {
int version;
};

+struct unicode_ops {
+ int (*validate)(const struct unicode_map *um, const struct qstr *str);
+ int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
+ const struct qstr *s2);
+ int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
+ const struct qstr *s2);
+ int (*strncasecmp_folded)(const struct unicode_map *um, const struct qstr *cf,
+ const struct qstr *s1);
+ int (*normalize)(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);
+ int (*casefold)(const struct unicode_map *um, const struct qstr *str,
+ unsigned char *dest, size_t dlen);
+ int (*casefold_hash)(const struct unicode_map *um, const void *salt,
+ struct qstr *str);
+ struct unicode_map* (*load)(const char *version);
+};
+
int unicode_validate(const struct unicode_map *um, const struct qstr *str);

int unicode_strncmp(const struct unicode_map *um,
@@ -33,4 +50,7 @@ int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
struct unicode_map *unicode_load(const char *version);
void unicode_unload(struct unicode_map *um);

+void unicode_register(struct unicode_ops *ops);
+void unicode_unregister(void);
+
#endif /* _LINUX_UNICODE_H */
--
2.30.1