[PATCH 1of2] pstore: new filesystem interface to platform persistent storage

From: Luck, Tony
Date: Tue Dec 28 2010 - 15:27:29 EST


Some platforms have a small amount of non-volatile storage that
can be used to store information useful to diagnose the cause of
a system crash. This is the generic part of a file system interface
that presents information from the crash as a series of files in
/dev/pstore. Once the information has been seen, the underlying
storage is freed by deleting the files.

Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>

---

Biggest changes from the previous version are
1) Save *all* oopses as well as panics.
2) Make files appear in /dev/pstore on running system so
they can be erased (after being saved) for non-fatal
errors.
3) Loop writing multiple records to log "enough" - default
is 10K, but this can be changed using /sys/fs/pstore/kmsg_bytes.

diff --git a/Documentation/ABI/testing/pstore b/Documentation/ABI/testing/pstore
new file mode 100644
index 0000000..f1fb2a0
--- /dev/null
+++ b/Documentation/ABI/testing/pstore
@@ -0,0 +1,35 @@
+Where: /dev/pstore/...
+Date: January 2011
+Kernel Version: 2.6.38
+Contact: tony.luck@xxxxxxxxx
+Description: Generic interface to platform dependent persistent storage.
+
+ Platforms that provide a mechanism to preserve some data
+ across system reboots can register with this driver to
+ provide a generic interface to show records captured in
+ the dying moments. In the case of a panic the last part
+ of the console log is captured, but other interesting
+ data can also be saved.
+
+ # mount -t pstore - /dev/pstore
+
+ $ ls -l /dev/pstore
+ total 0
+ -r--r--r-- 1 root root 7896 Nov 30 15:38 dmesg-erst-1
+
+ Different users of this interface will result in different
+ filename prefixes. Currently two are defined:
+
+ "dmesg" - saved console log
+ "mce" - architecture dependent data from fatal h/w error
+
+ Once the information in a file has been read, removing
+ the file will signal to the underlying persistent storage
+ device that it can reclaim the space for later re-use.
+
+ $ rm /dev/pstore/dmesg-erst-1
+
+ The expectation is that all files in /dev/pstore
+ will be saved elsewhere and erased from persistent store
+ soon after boot to free up space ready for the next
+ catastrophe.
diff --git a/Documentation/ABI/testing/sysfs-fs-pstore b/Documentation/ABI/testing/sysfs-fs-pstore
new file mode 100644
index 0000000..8e659d8
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-pstore
@@ -0,0 +1,7 @@
+What: /sys/fs/pstore/kmsg_bytes
+Date: January 2011
+Kernel Version: 2.6.38
+Contact: "Tony Luck" <tony.luck@xxxxxxxxx>
+Description:
+ Controls amount of console log that will be saved
+ to persistent store on oops/panic.
diff --git a/fs/Kconfig b/fs/Kconfig
index 771f457..2bbe47f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -188,6 +188,7 @@ source "fs/omfs/Kconfig"
source "fs/hpfs/Kconfig"
source "fs/qnx4/Kconfig"
source "fs/romfs/Kconfig"
+source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef..db71a5b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -121,3 +121,4 @@ obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_EXOFS_FS) += exofs/
obj-$(CONFIG_CEPH_FS) += ceph/
+obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
new file mode 100644
index 0000000..867d0ac
--- /dev/null
+++ b/fs/pstore/Kconfig
@@ -0,0 +1,13 @@
+config PSTORE
+ bool "Persistant store support"
+ default n
+ help
+ This option enables generic access to platform level
+ persistent storage via "pstore" filesystem that can
+ be mounted as /dev/pstore. Only useful if you have
+ a platform level driver that registers with pstore to
+ provide the data, so you probably should just go say "Y"
+ (or "M") to a platform specific persistent store driver
+ (e.g. ACPI_APEI on X86) which will select this for you.
+ If you don't have a platform persistent store driver,
+ say N.
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
new file mode 100644
index 0000000..760f4bc
--- /dev/null
+++ b/fs/pstore/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux pstorefs routines.
+#
+
+obj-y += pstore.o
+
+pstore-objs += inode.o platform.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
new file mode 100644
index 0000000..0e806aa
--- /dev/null
+++ b/fs/pstore/inode.c
@@ -0,0 +1,280 @@
+/*
+ * Persistent Storage - ramfs parts.
+ *
+ * Copyright (C) 2010 Intel Corporation <tony.luck@xxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/mount.h>
+#include <linux/ramfs.h>
+#include <linux/sched.h>
+#include <linux/magic.h>
+#include <linux/pstore.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+#define PSTORE_NAMELEN 64
+
+struct pstore_private {
+ u64 id;
+ int (*erase)(u64);
+};
+
+#define pstore_get_inode ramfs_get_inode
+
+/*
+ * When a file is unlinked from our file system we call the
+ * platform driver to erase the record from persistent store.
+ */
+static int pstore_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct pstore_private *p = dentry->d_inode->i_private;
+
+ p->erase(p->id);
+ kfree(p);
+
+ return simple_unlink(dir, dentry);
+}
+
+static const struct inode_operations pstore_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .unlink = pstore_unlink,
+};
+
+static const struct super_operations pstore_ops = {
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode,
+ .show_options = generic_show_options,
+};
+
+static struct super_block *pstore_sb;
+static struct vfsmount *pstore_mnt;
+
+int pstore_is_mounted(void)
+{
+ return pstore_mnt != NULL;
+}
+
+/*
+ * Set up a file structure as if we had opened this file and
+ * write our data to it.
+ */
+static int pstore_writefile(struct inode *inode, struct dentry *dentry,
+ char *data, size_t size)
+{
+ struct file f;
+ ssize_t n;
+ mm_segment_t old_fs = get_fs();
+
+ memset(&f, '0', sizeof f);
+ f.f_mapping = inode->i_mapping;
+ f.f_path.dentry = dentry;
+ f.f_path.mnt = pstore_mnt;
+ f.f_pos = 0;
+ f.f_op = inode->i_fop;
+ set_fs(KERNEL_DS);
+ n = do_sync_write(&f, data, size, &f.f_pos);
+ set_fs(old_fs);
+
+ fsnotify_modify(&f);
+
+ return n == size;
+}
+
+/*
+ * Make a regular file in the root directory of our file system.
+ * Load it up with "size" bytes of data from "buf".
+ * Set the mtime & ctime to the date that this record was originally stored.
+ */
+int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
+ char *data, size_t size,
+ struct timespec time, int (*erase)(u64))
+{
+ struct dentry *root = pstore_sb->s_root;
+ struct dentry *dentry;
+ struct inode *inode;
+ int rc;
+ char name[PSTORE_NAMELEN];
+ struct pstore_private *private;
+
+ rc = -ENOMEM;
+ inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
+ if (!inode)
+ goto fail;
+ inode->i_uid = inode->i_gid = 0;
+ private = kmalloc(sizeof *private, GFP_KERNEL);
+ if (!private)
+ goto fail_alloc;
+ private->id = id;
+ private->erase = erase;
+
+ switch (type) {
+ case PSTORE_TYPE_DMESG:
+ sprintf(name, "dmesg-%s-%lld", psname, id);
+ break;
+ case PSTORE_TYPE_MCE:
+ sprintf(name, "mce-%s-%lld", psname, id);
+ break;
+ case PSTORE_TYPE_UNKNOWN:
+ sprintf(name, "unknown-%s-%lld", psname, id);
+ break;
+ default:
+ sprintf(name, "type%d-%s-%lld", type, psname, id);
+ break;
+ }
+
+ mutex_lock(&root->d_inode->i_mutex);
+
+ rc = -ENOSPC;
+ dentry = d_alloc_name(root, name);
+ if (IS_ERR(dentry))
+ goto fail_lockedalloc;
+
+ d_add(dentry, inode);
+
+ mutex_unlock(&root->d_inode->i_mutex);
+
+ if (!pstore_writefile(inode, dentry, data, size))
+ goto fail_write;
+
+ inode->i_private = private;
+
+ if (time.tv_sec)
+ inode->i_mtime = inode->i_ctime = time;
+
+ return 0;
+
+fail_write:
+ kfree(private);
+ inode->i_nlink--;
+ mutex_lock(&root->d_inode->i_mutex);
+ d_delete(dentry);
+ dput(dentry);
+ mutex_unlock(&root->d_inode->i_mutex);
+ goto fail;
+
+fail_lockedalloc:
+ mutex_unlock(&root->d_inode->i_mutex);
+ kfree(private);
+fail_alloc:
+ iput(inode);
+
+fail:
+ return rc;
+}
+
+int pstore_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct inode *inode = NULL;
+ struct dentry *root;
+ int err;
+
+ save_mount_options(sb, data);
+
+ pstore_sb = sb;
+
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = PSTOREFS_MAGIC;
+ sb->s_op = &pstore_ops;
+ sb->s_time_gran = 1;
+
+ inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+ if (!inode) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ /* override ramfs "dir" options so we catch unlink(2) */
+ inode->i_op = &pstore_dir_inode_operations;
+
+ root = d_alloc_root(inode);
+ sb->s_root = root;
+ if (!root) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ pstore_get_records();
+
+ return 0;
+fail:
+ iput(inode);
+ return err;
+}
+
+static int pstore_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ struct dentry *root;
+
+ root = mount_nodev(fs_type, flags, data, pstore_fill_super);
+ if (IS_ERR(root))
+ return -ENOMEM;
+
+ mnt->mnt_root = root;
+ mnt->mnt_sb = root->d_sb;
+ pstore_mnt = mnt;
+
+ return 0;
+}
+
+static void pstore_kill_sb(struct super_block *sb)
+{
+ kill_litter_super(sb);
+ pstore_sb = NULL;
+ pstore_mnt = NULL;
+}
+
+static struct file_system_type pstore_fs_type = {
+ .name = "pstore",
+ .get_sb = pstore_get_sb,
+ .kill_sb = pstore_kill_sb,
+};
+
+static int __init init_pstore_fs(void)
+{
+ int ret = 0;
+ struct kobject *pstorefs_kobj;
+
+ pstorefs_kobj = kobject_create_and_add("pstore", fs_kobj);
+ if (!pstorefs_kobj)
+ return -ENOMEM;
+
+ sysfs_create_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
+
+ ret = register_filesystem(&pstore_fs_type);
+
+ if (ret) {
+ sysfs_remove_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
+ kobject_put(pstorefs_kobj);
+ }
+
+ return ret;
+}
+module_init(init_pstore_fs)
+
+MODULE_AUTHOR("Tony Luck <tony.luck@xxxxxxxxx>");
+MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
new file mode 100644
index 0000000..76c26d2
--- /dev/null
+++ b/fs/pstore/internal.h
@@ -0,0 +1,7 @@
+extern void pstore_get_records(void);
+extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
+ char *data, size_t size,
+ struct timespec time, int (*erase)(u64));
+extern int pstore_is_mounted(void);
+
+extern struct kobj_attribute pstore_kmsg_bytes_attr;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
new file mode 100644
index 0000000..705fdf8
--- /dev/null
+++ b/fs/pstore/platform.c
@@ -0,0 +1,202 @@
+/*
+ * Persistent Storage - platform driver interface parts.
+ *
+ * Copyright (C) 2010 Intel Corporation <tony.luck@xxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/atomic.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kmsg_dump.h>
+#include <linux/module.h>
+#include <linux/pstore.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+/*
+ * pstore_lock just protects "psinfo" during
+ * calls to pstore_register()
+ */
+static DEFINE_SPINLOCK(pstore_lock);
+static struct pstore_info *psinfo;
+
+/* How much of the console log to snapshot. /sys/fs/pstore/kmsg_bytes */
+static unsigned long kmsg_bytes = 10240;
+
+static ssize_t b_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%lu\n", kmsg_bytes);
+}
+
+static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return (sscanf(buf, "%lu", &kmsg_bytes) > 0) ? count : 0;
+}
+
+struct kobj_attribute pstore_kmsg_bytes_attr =
+ __ATTR(kmsg_bytes, S_IRUGO | S_IWUSR, b_show, b_store);
+
+/* Tag each group of saved records with a sequence number */
+static int oopscount;
+
+/*
+ * callback from kmsg_dump. (s2,l2) has the most recently
+ * written bytes, older bytes are in (s1,l1). Save as much
+ * as we can from the end of the buffer.
+ */
+static void pstore_dump(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason,
+ const char *s1, unsigned long l1,
+ const char *s2, unsigned long l2)
+{
+ unsigned long s1_start, s2_start;
+ unsigned long l1_cpy, l2_cpy;
+ unsigned long size, total = 0;
+ char *dst;
+ u64 id;
+ int hsize, part = 1;
+
+ mutex_lock(&psinfo->buf_mutex);
+ oopscount++;
+ while (total < kmsg_bytes) {
+ dst = psinfo->buf;
+ hsize = sprintf(dst, "Oops#%d Part%d\n", oopscount, part++);
+ size = psinfo->bufsize - hsize;
+ dst += hsize;
+
+ l2_cpy = min(l2, size);
+ l1_cpy = min(l1, size - l2_cpy);
+
+ if (l1_cpy + l2_cpy == 0)
+ break;
+
+ s2_start = l2 - l2_cpy;
+ s1_start = l1 - l1_cpy;
+
+ memcpy(dst, s1 + s1_start, l1_cpy);
+ memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
+
+ id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy);
+ if (pstore_is_mounted())
+ pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
+ psinfo->buf, hsize + l1_cpy + l2_cpy,
+ CURRENT_TIME, psinfo->erase);
+ l1 -= l1_cpy;
+ l2 -= l2_cpy;
+ total += l1_cpy + l2_cpy;
+ }
+ mutex_unlock(&psinfo->buf_mutex);
+}
+
+static struct kmsg_dumper pstore_dumper = {
+ .dump = pstore_dump,
+};
+
+/*
+ * platform specific persistent storage driver registers with
+ * us here. If pstore is already mounted, call the platform
+ * read function right away to populate the file system. If not
+ * then the pstore mount code will call us later to fill out
+ * the file system.
+ *
+ * Register with kmsg_dump to save last part of console log on panic.
+ */
+int pstore_register(struct pstore_info *psi)
+{
+ struct module *owner = psi->owner;
+
+ spin_lock(&pstore_lock);
+ if (psinfo) {
+ spin_unlock(&pstore_lock);
+ return -EBUSY;
+ }
+ psinfo = psi;
+ spin_unlock(&pstore_lock);
+
+ if (owner && !try_module_get(owner)) {
+ psinfo = NULL;
+ return -EINVAL;
+ }
+
+ if (pstore_is_mounted())
+ pstore_get_records();
+
+ kmsg_dump_register(&pstore_dumper);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pstore_register);
+
+/*
+ * Read all the records from the persistent store. Create and
+ * file files in our filesystem.
+ */
+void pstore_get_records(void)
+{
+ struct pstore_info *psi = psinfo;
+ size_t size;
+ u64 id;
+ enum pstore_type_id type;
+ struct timespec time;
+ int failed = 0;
+
+ if (!psi)
+ return;
+
+ mutex_lock(&psinfo->buf_mutex);
+ while ((size = psi->read(&id, &type, &time)) > 0) {
+ if (pstore_mkfile(type, psi->name, id, psi->buf, size,
+ time, psi->erase))
+ failed++;
+ }
+ mutex_unlock(&psinfo->buf_mutex);
+
+ if (failed)
+ printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
+ failed, psi->name);
+}
+
+/*
+ * Call platform driver to write a record to the
+ * persistent store.
+ */
+int pstore_write(enum pstore_type_id type, char *buf, size_t size)
+{
+ u64 id;
+
+ if (!psinfo)
+ return -ENODEV;
+
+ if (size > psinfo->bufsize)
+ return -EFBIG;
+
+ mutex_lock(&psinfo->buf_mutex);
+ memcpy(psinfo->buf, buf, size);
+ id = psinfo->write(type, size);
+ if (pstore_is_mounted())
+ pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
+ size, CURRENT_TIME, psinfo->erase);
+ mutex_unlock(&psinfo->buf_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pstore_write);
diff --git a/include/linux/magic.h b/include/linux/magic.h
index ff690d0..e87fd5a 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -26,6 +26,7 @@
#define ISOFS_SUPER_MAGIC 0x9660
#define JFFS2_SUPER_MAGIC 0x72b6
#define ANON_INODE_FS_MAGIC 0x09041934
+#define PSTOREFS_MAGIC 0x6165676C

#define MINIX_SUPER_MAGIC 0x137F /* original minix fs */
#define MINIX_SUPER_MAGIC2 0x138F /* minix fs, 30 char names */
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
new file mode 100644
index 0000000..4197773
--- /dev/null
+++ b/include/linux/pstore.h
@@ -0,0 +1,60 @@
+/*
+ * Persistent Storage - pstore.h
+ *
+ * Copyright (C) 2010 Intel Corporation <tony.luck@xxxxxxxxx>
+ *
+ * This code is the generic layer to export data records from platform
+ * level persistent storage via a file system.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _LINUX_PSTORE_H
+#define _LINUX_PSTORE_H
+
+/* types */
+enum pstore_type_id {
+ PSTORE_TYPE_DMESG = 0,
+ PSTORE_TYPE_MCE = 1,
+ PSTORE_TYPE_UNKNOWN = 255
+};
+
+struct pstore_info {
+ struct module *owner;
+ char *name;
+ struct mutex buf_mutex; /* serialize access to 'buf' */
+ char *buf;
+ size_t bufsize;
+ size_t (*read)(u64 *id, enum pstore_type_id *type,
+ struct timespec *time);
+ u64 (*write)(enum pstore_type_id type, size_t size);
+ int (*erase)(u64 id);
+};
+
+#ifdef CONFIG_PSTORE
+extern int pstore_register(struct pstore_info *);
+extern int pstore_write(enum pstore_type_id type, char *buf, size_t size);
+#else
+static inline int
+pstore_register(struct pstore_info *psi)
+{
+ return -ENODEV;
+}
+static inline int
+pstore_write(enum pstore_type_id type, char *buf, size_t size)
+{
+ return -ENODEV;
+}
+#endif
+
+#endif /*_LINUX_PSTORE_H*/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/