Re: [RFC PATCH 1/2] mm, tree wide: replace __GFP_REPEAT by __GFP_RETRY_HARD with more useful semantic

From: Tetsuo Handa
Date: Tue Jun 14 2016 - 07:12:19 EST


Michal Hocko wrote:
> > > > That _somebody_ might release oom_lock without invoking the OOM killer (e.g.
> > > > when doing a !__GFP_FS allocation), which means that we have reached the OOM
> > > > condition and nobody is actually handling the OOM on our behalf.
> > > > __GFP_RETRY_HARD becomes as weak as __GFP_NORETRY. I think this is a regression.
> > >
> > > I really fail to see your point. We are talking about a gfp flag which
> > > tells the allocator to retry as much as is feasible. Getting through
> > > all the reclaim attempts twice without any progress sounds like a
> > > fair criterion. Well, we could try $NUM times, but that wouldn't make
> > > much difference to what you are writing above. Whether somebody has
> > > been killed or not is not really that important IMHO.
> >
> > If the first round of reclaim attempts made no progress, a second round
> > is unlikely to make progress unless the OOM killer kills something.
> > Thus, doing all the reclaim attempts twice without any progress and
> > without killing somebody sounds almost equivalent to doing them only
> > once.
>
> Yes, that is possible. You might have a GFP_NOFS-only load where nothing
> really invokes the OOM killer. Does that actually matter, though? The
> semantic of the flag is to retry hard while the page allocator believes
> it can make forward progress. But not forever. We never know whether
> progress is possible at all. We have certain heuristics for when to give
> up, invoke the OOM killer and try again hoping things have changed. This
> is not much different, except we declare that there is no hope after
> getting to the OOM point again without being able to succeed. Are you
> suggesting a more precise heuristic? Or do you claim that we do not need
> a flag which provides a middle ground between __GFP_NORETRY and
> __GFP_NOFAIL, which sit at the extremes?

Well, maybe we can get rid of __GFP_RETRY_HARD (or reserve it for huge pages
only). Many __GFP_RETRY_HARD users are ready to fall back to vmalloc().

We are not sure whether such __GFP_RETRY_HARD users want to retry with
OOM-killing somebody (we do not have a __GFP_MAY_OOM_KILL flag which
explicitly asks for "retry with OOM-killing somebody").
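
To make the missing semantic concrete, here is the wbsd call site from the
survey below with only a comment added (a sketch of the question being asked,
not a proposed change):

        /*
         * Does __GFP_REPEAT here mean "retry hard and OOM-kill somebody if
         * that is what it takes", or merely "try a bit harder before
         * failing"?  The gfp mask cannot express the difference, because
         * nothing like __GFP_MAY_OOM_KILL exists.
         */
        host->dma_buffer = kmalloc(65536,
                                   GFP_NOIO | GFP_DMA | __GFP_REPEAT | __GFP_NOWARN);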

If __GFP_RETRY_HARD means nothing more than "try once more", then

        void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
        if (!n)
                n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);

will emulate it.
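
If that is really all the flag guarantees, the opportunistic callers could
even fold that emulation into the vmalloc() fallback they already use; a
minimal sketch (the helper name is made up for illustration):

        /*
         * Illustrative only: two __GFP_NORETRY attempts emulate "try once
         * more", then fall back to vzalloc() like most call sites below do.
         */
        static void *kvzalloc_try_twice(size_t size)
        {
                void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);

                if (!p)
                        p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
                if (!p)
                        p = vzalloc(size);
                return p;
        }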



----- arch/powerpc/include/asm/book3s/64/pgalloc.h -----

static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
{
#ifdef CONFIG_PPC_64K_PAGES
        return (pgd_t *)__get_free_page(PGALLOC_GFP);
#else
        struct page *page;
        page = alloc_pages(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO | __GFP_REPEAT, 4);
        if (!page)
                return NULL;
        return (pgd_t *) page_address(page);
#endif
}

----- arch/powerpc/kvm/book3s_64_mmu_hv.c -----

        kvm->arch.hpt_cma_alloc = 0;
        page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
        if (page) {
                hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
                memset((void *)hpt, 0, (1ul << order));
                kvm->arch.hpt_cma_alloc = 1;
        }

        /* Lastly try successively smaller sizes from the page allocator */
        /* Only do this if userspace didn't specify a size via ioctl */
        while (!hpt && order > 18 && !htab_orderp) {
                hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
                                       __GFP_NOWARN, order - PAGE_SHIFT);
                if (!hpt)
                        --order;
        }

        if (!hpt)
                return -ENOMEM;

----- drivers/vhost/vhost.c -----

static void *vhost_kvzalloc(unsigned long size)
{
        void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);

        if (!n)
                n = vzalloc(size);
        return n;
}

----- drivers/vhost/scsi.c -----

        vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!vs) {
                vs = vzalloc(sizeof(*vs));
                if (!vs)
                        goto err_vs;
        }

----- drivers/vhost/net.c -----

        n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!n) {
                n = vmalloc(sizeof *n);
                if (!n)
                        return -ENOMEM;
        }

----- drivers/block/xen-blkfront.c -----

        /* Stage 1: Make a safe copy of the shadow state. */
        copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
                       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
        if (!copy)
                return -ENOMEM;

----- drivers/mmc/host/wbsd.c -----

        /*
         * We need to allocate a special buffer in
         * order for ISA to be able to DMA to it.
         */
        host->dma_buffer = kmalloc(65536,
                GFP_NOIO | GFP_DMA | __GFP_REPEAT | __GFP_NOWARN);
        if (!host->dma_buffer)
                goto free;

----- drivers/target/target_core_transport.c -----

        se_sess->sess_cmd_map = kzalloc(tag_num * tag_size,
                                        GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!se_sess->sess_cmd_map) {
                se_sess->sess_cmd_map = vzalloc(tag_num * tag_size);
                if (!se_sess->sess_cmd_map) {
                        pr_err("Unable to allocate se_sess->sess_cmd_map\n");
                        return -ENOMEM;
                }
        }

----- drivers/s390/char/vmcp.c -----

        if (mutex_lock_interruptible(&session->mutex)) {
                kfree(cmd);
                return -ERESTARTSYS;
        }
        if (!session->response)
                session->response = (char *)__get_free_pages(GFP_KERNEL
                                                | __GFP_REPEAT | GFP_DMA,
                                                get_order(session->bufsize));
        if (!session->response) {
                mutex_unlock(&session->mutex);
                kfree(cmd);
                return -ENOMEM;
        }

----- fs/btrfs/raid56.c -----

        table_size = sizeof(*table) + sizeof(*h) * num_entries;
        table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!table) {
                table = vzalloc(table_size);
                if (!table)
                        return -ENOMEM;
        }

----- fs/btrfs/check-integrity.c -----

        state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!state) {
                state = vzalloc(sizeof(*state));
                if (!state) {
                        printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
                        return -1;
                }
        }

----- mm/sparse-vmemmap.c -----

void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
        /* If the main allocator is up use that, fallback to bootmem. */
        if (slab_is_available()) {
                struct page *page;

                if (node_state(node, N_HIGH_MEMORY))
                        page = alloc_pages_node(
                                node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
                                get_order(size));
                else
                        page = alloc_pages(
                                GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
                                get_order(size));
                if (page)
                        return page_address(page);
                return NULL;
        } else
                return __earlyonly_bootmem_alloc(node, size, size,
                                __pa(MAX_DMA_ADDRESS));
}

----- mm/hugetlb.c -----

static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
        struct page *page;

        page = __alloc_pages_node(nid,
                htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
                                                __GFP_REPEAT|__GFP_NOWARN,
                huge_page_order(h));
        if (page) {
                prep_new_huge_page(h, page, nid);
        }

        return page;
}

static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr, int nid)
{
        int order = huge_page_order(h);
        gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
        unsigned int cpuset_mems_cookie;

----- net/core/skbuff.c -----

        gfp_head = gfp_mask;
        if (gfp_head & __GFP_DIRECT_RECLAIM)
                gfp_head |= __GFP_REPEAT;

        *errcode = -ENOBUFS;
        skb = alloc_skb(header_len, gfp_head);
        if (!skb)
                return NULL;

----- net/core/dev.c -----

        rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!rx) {
                rx = vzalloc(sz);
                if (!rx)
                        return -ENOMEM;
        }
        dev->_rx = rx;

        tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!tx) {
                tx = vzalloc(sz);
                if (!tx)
                        return -ENOMEM;
        }
        dev->_tx = tx;

        p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!p)
                p = vzalloc(alloc_size);
        if (!p)
                return NULL;

----- net/sched/sch_fq.c -----

static void *fq_alloc_node(size_t sz, int node)
{
        void *ptr;

        ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node);
        if (!ptr)
                ptr = vmalloc_node(sz, node);
        return ptr;
}