Re: [PATCH v8 20/43] arm64: RME: Runtime faulting of memory

From: Suzuki K Poulose
Date: Mon May 19 2025 - 13:36:17 EST


Hi Steven

On 16/04/2025 14:41, Steven Price wrote:
At runtime, if the realm guest accesses memory which hasn't yet been
mapped then KVM needs to either populate the region or fault the guest.

For memory in the lower (protected) region of IPA a fresh page is
provided to the RMM which will zero the contents. For memory in the
upper (shared) region of IPA, the memory from the memslot is mapped
into the realm VM as non-secure.

Signed-off-by: Steven Price <steven.price@xxxxxxx>

Please find some comments below.

...

diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
index f6af3ea6ea8a..b6959cd17a6c 100644
--- a/arch/arm64/kvm/rme.c
+++ b/arch/arm64/kvm/rme.c
@@ -714,6 +714,186 @@ static int realm_create_protected_data_page(struct realm *realm,
return -ENXIO;
}
+static int fold_rtt(struct realm *realm, unsigned long addr, int level)
+{
+ phys_addr_t rtt_addr;
+ int ret;
+
+ ret = realm_rtt_fold(realm, addr, level, &rtt_addr);
+ if (ret)
+ return ret;
+
+ free_delegated_granule(rtt_addr);
+
+ return 0;
+}
+
+int realm_map_protected(struct realm *realm,
+ unsigned long ipa,
+ kvm_pfn_t pfn,
+ unsigned long map_size,
+ struct kvm_mmu_memory_cache *memcache)
+{
+ phys_addr_t phys = __pfn_to_phys(pfn);
+ phys_addr_t rd = virt_to_phys(realm->rd);
+ unsigned long base_ipa = ipa;
+ unsigned long size;
+ int map_level;
+ int ret = 0;
+
+ if (WARN_ON(!IS_ALIGNED(map_size, RMM_PAGE_SIZE)))
+ return -EINVAL;
+
+ if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
+ return -EINVAL;
+
+ if (IS_ALIGNED(map_size, RMM_L2_BLOCK_SIZE))
+ map_level = 2;

minor nit: RMM_RTT_BLOCK_LEVEL

+ else
+ map_level = 3;

minor nit: RMM_RTT_MAX_LEVEL ?
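
i.e. something along these lines, assuming the series already defines
RMM_RTT_BLOCK_LEVEL as level 2 (if not, the name here is only
illustrative):

	if (IS_ALIGNED(map_size, RMM_L2_BLOCK_SIZE))
		map_level = RMM_RTT_BLOCK_LEVEL;
	else
		map_level = RMM_RTT_MAX_LEVEL;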

+
+ if (map_level < RMM_RTT_MAX_LEVEL) {
+ /*
+ * A temporary RTT is needed during the map, precreate it,
+ * however if there is an error (e.g. missing parent tables)
+ * this will be handled below.
+ */
+ realm_create_rtt_levels(realm, ipa, map_level,
+ RMM_RTT_MAX_LEVEL, memcache);
+ }
+
+ for (size = 0; size < map_size; size += RMM_PAGE_SIZE) {
+ if (rmi_granule_delegate(phys)) {
+ /*
+ * It's likely we raced with another VCPU on the same
+ * fault. Assume the other VCPU has handled the fault
+ * and return to the guest.
+ */
+ return 0;
+ }
+
+ ret = rmi_data_create_unknown(rd, phys, ipa);
+
+ if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
+ /* Create missing RTTs and retry */
+ int level = RMI_RETURN_INDEX(ret);
+
+ WARN_ON(level == RMM_RTT_MAX_LEVEL);
+
+ ret = realm_create_rtt_levels(realm, ipa, level,
+ RMM_RTT_MAX_LEVEL,
+ memcache);
+ if (ret)
+ goto err_undelegate;
+
+ ret = rmi_data_create_unknown(rd, phys, ipa);
+ }
+
+ if (WARN_ON(ret))
+ goto err_undelegate;
+
+ phys += RMM_PAGE_SIZE;
+ ipa += RMM_PAGE_SIZE;
+ }
+
+ if (map_size == RMM_L2_BLOCK_SIZE) {
+ ret = fold_rtt(realm, base_ipa, map_level + 1);
+ if (WARN_ON(ret))
+ goto err;
+ }
+
+ return 0;
+
+err_undelegate:
+ if (WARN_ON(rmi_granule_undelegate(phys))) {
+ /* Page can't be returned to NS world so is lost */
+ get_page(phys_to_page(phys));
+ }
+err:
+ while (size > 0) {
+ unsigned long data, top;
+
+ phys -= RMM_PAGE_SIZE;
+ size -= RMM_PAGE_SIZE;
+ ipa -= RMM_PAGE_SIZE;
+
+ WARN_ON(rmi_data_destroy(rd, ipa, &data, &top));
+
+ if (WARN_ON(rmi_granule_undelegate(phys))) {
+ /* Page can't be returned to NS world so is lost */
+ get_page(phys_to_page(phys));
+ }
+ }
+ return -ENXIO;
+}
+
+int realm_map_non_secure(struct realm *realm,
+ unsigned long ipa,
+ kvm_pfn_t pfn,
+ unsigned long size,
+ struct kvm_mmu_memory_cache *memcache)
+{
+ phys_addr_t rd = virt_to_phys(realm->rd);
+ phys_addr_t phys = __pfn_to_phys(pfn);
+ unsigned long offset;
+ int map_size, map_level;
+ int ret = 0;
+
+ if (WARN_ON(!IS_ALIGNED(size, RMM_PAGE_SIZE)))
+ return -EINVAL;
+
+ if (WARN_ON(!IS_ALIGNED(ipa, size)))
+ return -EINVAL;
+
+ if (IS_ALIGNED(size, RMM_L2_BLOCK_SIZE)) {
+ map_level = 2;
+ map_size = RMM_L2_BLOCK_SIZE;

Same here, stick to the symbolic names rather than raw digits.

+ } else {
+ map_level = 3;
+ map_size = RMM_PAGE_SIZE;
+ }
+
+ for (offset = 0; offset < size; offset += map_size) {
+ /*
+ * realm_map_ipa() enforces that the memory is writable,

The function name seems to be obsolete, please fix.

+ * so for now we permit both read and write.
+ */
+ unsigned long desc = phys |
+ PTE_S2_MEMATTR(MT_S2_FWB_NORMAL) |
+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R |
+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+ ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc);
+
+ if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {

Could we hit the following case and end up in failure?

* Initially a single page is shared and the S2 is mapped.
* Later the Realm shares the entire L2 block and encounters a fault
at a new IPA within that L2 block.

In this case, we may try to create an L2 mapping where an L3 mapping
already exists, and we could encounter (RMI_ERROR_RTT, 2).


+ /* Create missing RTTs and retry */
+ int level = RMI_RETURN_INDEX(ret);

So we should probably still go down the RTT create step, but guarded by
the following check:

if (level < map_level) {

+
+ ret = realm_create_rtt_levels(realm, ipa, level,
+ map_level, memcache);
+ if (ret)
+ return -ENXIO;
+
+ ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc);


} else {

Otherwise, maybe we need to do some more hard work to fix it up:

1. If map_level == 3, something is terribly wrong, or we raced with another thread?

2. If map_level < 3 and we didn't race:

a. Go one level down, create the mappings there and then fold. But we could end up dealing with (RMI_ERROR_RTT, 3) as in (1).

b. The easiest is to destroy the table at "map_level + 1" and retry the map, roughly as in the sketch below.
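
Putting the above together, a rough and untested sketch of the
RMI_ERROR_RTT handling I have in mind (leaving the else-branch fix-up
open, and reusing only the helpers already in this patch):

	if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
		int level = RMI_RETURN_INDEX(ret);

		if (level < map_level) {
			/* Missing parent RTTs: create them and retry */
			ret = realm_create_rtt_levels(realm, ipa, level,
						      map_level, memcache);
			if (ret)
				return -ENXIO;

			ret = rmi_rtt_map_unprotected(rd, ipa, map_level,
						      desc);
		} else {
			/*
			 * A table (or a live mapping) already exists at or
			 * below map_level, e.g. from an earlier single-page
			 * share within this block. This needs the fix-up
			 * described above: either fall back to mapping at
			 * RMM_RTT_MAX_LEVEL, or tear down the conflicting
			 * table at map_level + 1 and retry at map_level.
			 */
		}
	}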


Suzuki



+ }
+ /*
+ * RMI_ERROR_RTT can be reported for two reasons: either the
+ * RTT tables are not there, or there is an RTTE already
+ * present for the address. The call to
+ * realm_create_rtt_levels() above handles the first case, and
+ * in the second case this indicates that another thread has
+ * already populated the RTTE for us, so we can ignore the
+ * error and continue.
+ */
+ if (ret && RMI_RETURN_STATUS(ret) != RMI_ERROR_RTT)
+ return -ENXIO;
+
+ ipa += map_size;
+ phys += map_size;
+ }
+
+ return 0;
+}
+
static int populate_region(struct kvm *kvm,
phys_addr_t ipa_base,
phys_addr_t ipa_end,