Message-ID: <20161024044714.GA4938@blaptop>
Date: Mon, 24 Oct 2016 13:47:14 +0900
From: Minchan Kim <minchan@...nel.org>
To: Sergey Senozhatsky <sergey.senozhatsky.work@...il.com>
Cc: Minchan Kim <minchan@...nel.org>,
Andrew Morton <akpm@...ux-foundation.org>,
Sergey Senozhatsky <sergey.senozhatsky@...il.com>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/3] zram: support page-based parallel write
Hi Sergey,
On Fri, Oct 21, 2016 at 03:03:34PM +0900, Sergey Senozhatsky wrote:
> On (09/22/16 15:42), Minchan Kim wrote:
> > +static ssize_t use_aio_store(struct device *dev,
> > + struct device_attribute *attr, const char *buf, size_t len)
> > +{
> > + int ret;
> > + u16 do_async;
> > + struct zram *zram = dev_to_zram(dev);
> > +
> > + ret = kstrtou16(buf, 10, &do_async);
> > + if (ret)
> > + return ret;
> > +
> > + down_write(&zram->init_lock);
> > + if (init_done(zram)) {
> > + up_write(&zram->init_lock);
> > + pr_info("Can't change for initialized device\n");
> > + return -EBUSY;
> > + }
> > +
> > + if (do_async)
> > + zram->use_aio = true;
> > + else
> > + zram->use_aio = false;
>
> a stupid nitpick:
> zram->use_aio = do_async?
It's just a matter of preference, and I don't mind it.
I will use it in the next spin.
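FWIW, with that the whole if/else above collapses to one line (trivial
sketch; do_async is the u16 parsed by kstrtou16 above, and assigning it
to the bool member turns any non-zero value into true):

	zram->use_aio = do_async;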
>
> > + up_write(&zram->init_lock);
> > +
> > + return len;
> > +}
> > +#endif
> > +
> [..]
> > +static void worker_wake_up(void)
> > +{
> > + /*
> > + * Unless there are enough workers to handle the accumulated requests,
> > + * wake up new workers.
> > + */
> > + if (workers.nr_running * NR_BATCH_PAGES < workers.nr_req) {
> > + int nr_wakeup = (workers.nr_req + NR_BATCH_PAGES) /
> > + NR_BATCH_PAGES - workers.nr_running;
> > +
> > + wake_up_nr(&workers.req_wait, nr_wakeup);
> > + }
> > +}
> > +
> > +static void zram_unplug(struct blk_plug_cb *cb, bool from_schedule)
> > +{
> > + spin_lock(&workers.req_lock);
> > + if (workers.nr_req)
> > + worker_wake_up();
> > + spin_unlock(&workers.req_lock);
> > + kfree(cb);
> > +}
> > +
> > +static int zram_check_plugged(void)
> > +{
> > + return !!blk_check_plugged(zram_unplug, NULL,
> > + sizeof(struct blk_plug_cb));
> > +}
>
> I'm having some troubles understanding the purpose of zram_check_plugged().
> it's a global symbol, can you just use it directly? otherwise we are
> doing additional kmalloc/kfree, spin_lock/unlock and so on.
I don't understand it. How does using zram_check_plugged directly reduce
the kmalloc/kfree and spin_lock/unlock you mentioned?
>
> what am I missing? current->plug? can it affect us? how?
Sorry. I can't understand your point.
>
>
> > +int queue_page_request(struct zram *zram, struct bio_vec *bvec, u32 index,
> > + int offset, bool write)
>
> static int?
Yeb.
>
> > +{
> > + struct page_request *page_req = kmalloc(sizeof(*page_req), GFP_NOIO);
> > +
> > + if (!page_req)
> > + return -ENOMEM;
>
>
> ...
>
>
> > +int queue_page_request_list(struct zram *zram, struct bio_request *bio_req,
>
> static int?
Yeb.
>
> > + struct bio_vec *bvec, u32 index, int offset,
> > + bool write, struct list_head *page_list)
> > +{
> > + struct page_request *page_req = kmalloc(sizeof(*page_req), GFP_NOIO);
> > +
> > + if (!page_req) {
> > + while (!list_empty(page_list)) {
> > + page_req = list_first_entry(page_list,
> > + struct page_request, list);
> > + list_del(&page_req->list);
> > + kfree(page_req);
> > + }
> > +
> > + return -ENOMEM;
> > + }
> > +
> > + page_req->bio_req = bio_req;
> > + page_req->zram = zram;
> > + page_req->bvec = *bvec;
> > + page_req->index = index;
> > + page_req->offset = offset;
> > + page_req->write = write;
> > + atomic_inc(&bio_req->nr_pages);
> > +
> > + list_add_tail(&page_req->list, page_list);
> > +
> > + return 0;
> > +}
> > +
> > +/*
> > + * @page_list: pages isolated from request queue
> > + */
> > +static void get_page_requests(struct list_head *page_list)
> > +{
> > + struct page_request *page_req;
> > + int nr_pages;
> > +
> > + for (nr_pages = 0; nr_pages < NR_BATCH_PAGES; nr_pages++) {
> > + if (list_empty(&workers.req_list))
> > + break;
> > +
> > + page_req = list_first_entry(&workers.req_list,
> > + struct page_request, list);
> > + list_move(&page_req->list, page_list);
> > + }
> > +
> > + workers.nr_req -= nr_pages;
> > +}
> > +
> > +/*
> > + * move @page_list to request queue and wake up workers if it is necessary.
> > + */
> > +static void run_worker(struct bio *bio, struct list_head *page_list,
> > + unsigned int nr_pages)
> > +{
> > + WARN_ON(list_empty(page_list));
> > +
> > + spin_lock(&workers.req_lock);
> > + list_splice_tail(page_list, &workers.req_list);
> > + workers.nr_req += nr_pages;
> > + if (bio->bi_opf & REQ_SYNC || !zram_check_plugged())
> > + worker_wake_up();
> > + spin_unlock(&workers.req_lock);
> > +}
> > +
> > +static bool __zram_make_async_request(struct zram *zram, struct bio *bio)
> > +{
> > + int offset;
> > + u32 index;
> > + struct bio_vec bvec;
> > + struct bvec_iter iter;
> > + LIST_HEAD(page_list);
> > + struct bio_request *bio_req;
> > + unsigned int nr_pages = 0;
> > + bool write = op_is_write(bio_op(bio));
> > +
> > + if (!zram->use_aio || !op_is_write(bio_op(bio)))
> > + return false;
> > +
> > + bio_req = kmalloc(sizeof(*bio_req), GFP_NOIO);
> > + if (!bio_req)
> > + return false;
> > +
> > + index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
> > + offset = (bio->bi_iter.bi_sector &
> > + (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
> > +
> > + bio_req->bio = bio;
> > + atomic_set(&bio_req->nr_pages, 0);
> > +
> > + bio_for_each_segment(bvec, bio, iter) {
> > + int max_transfer_size = PAGE_SIZE - offset;
> > +
> > + if (bvec.bv_len > max_transfer_size) {
> > + struct bio_vec bv;
> > +
> > + bv.bv_page = bvec.bv_page;
> > + bv.bv_len = max_transfer_size;
> > + bv.bv_offset = bvec.bv_offset;
> > +
> > + if (queue_page_request_list(zram, bio_req, &bv,
> > + index, offset, write, &page_list))
> > + goto fail;
> > +
> > + bv.bv_len = bvec.bv_len - max_transfer_size;
> > + bv.bv_offset += max_transfer_size;
> > + if (queue_page_request_list(zram, bio_req, &bv,
> > + index + 1, 0, write, &page_list))
> > + goto fail;
> > + } else {
> > + if (queue_page_request_list(zram, bio_req, &bvec,
> > + index, offset, write, &page_list))
> > + goto fail;
> > + }
> > +
> > + nr_pages++;
> > + update_position(&index, &offset, &bvec);
> > + }
> > +
> > + run_worker(bio, &page_list, nr_pages);
> > +
> > + return true;
> > +fail:
> > + kfree(bio_req);
> > +
> > + WARN_ON(!list_empty(&page_list));
> > +
> > + return false;
> > +}
> > +
> > +void page_requests_rw(struct list_head *page_list)
> > +{
> > + struct page_request *page_req;
> > + bool write;
> > + struct page *page;
> > + struct zram *zram;
> > + int err;
> > + bool free_bio;
> > + struct bio_request *bio_req;
> > +
> > + while (!list_empty(page_list)) {
> > + free_bio = false;
> > + page_req = list_last_entry(page_list, struct page_request,
> > + list);
> > + write = page_req->write;
> > + page = page_req->bvec.bv_page;
> > + zram = page_req->zram;
> > + bio_req = page_req->bio_req;
> > + if (bio_req && atomic_dec_and_test(&bio_req->nr_pages))
> > + free_bio = true;
> > + list_del(&page_req->list);
> > +
> > + err = zram_bvec_rw(zram, &page_req->bvec, page_req->index,
> > + page_req->offset, page_req->write);
> > +
> > + kfree(page_req);
> > +
> > + /* page-based request */
> > + if (!bio_req) {
> > + page_endio(page, write, err);
> > + zram_meta_put(zram);
>
> who is calling zram_meta_get()? I mean in this loop...
> what if the loop continues after that `if'?
Nobody calls zram_meta_get in this loop; it should be called by the IO
queuing routine. The !bio_req case above is the rw_page path, so each
page_req holds a reference to zram_meta, and here the service routine
releases that reference once the IO is done.
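In other words, the intended pairing is roughly like below (just a sketch,
not the actual code, and the error values are only for illustration):

	/* queuing side: each queued rw_page request pins zram_meta ... */
	static int queue_page_request(struct zram *zram, struct bio_vec *bvec,
				u32 index, int offset, bool write)
	{
		struct page_request *page_req;

		if (unlikely(!zram_meta_get(zram)))
			return -EINVAL;

		page_req = kmalloc(sizeof(*page_req), GFP_NOIO);
		if (!page_req) {
			zram_meta_put(zram);
			return -ENOMEM;
		}

		/* ... fill page_req and queue it, as in the patch ... */

		return 0;
	}

	/* ... and the worker releases it in page_requests_rw() right after
	 * page_endio(), which is the zram_meta_put() you quoted. */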
>
> > + /* bio-based request */
> > + } else if (free_bio) {
> > + if (likely(!err))
> > + bio_endio(bio_req->bio);
> > + else
> > + bio_io_error(bio_req->bio);
> > + kfree(bio_req);
> > + zram_meta_put(zram);
>
> ditto, who is calling zram_meta_get()?
> what if the loop continue after that `if'?
>
>
> we do zram_meta_get() only once in zram_make_request(). but then
> put meta for every request in page_list, while a bio passed to
> __zram_make_async_request() can span across several requests in
> page_list. right?
You're right. The bio-based request case should be fixed.
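One way to fix it (just an idea, not tested) is to make every queued bio_req
own its own reference, so the zram_meta_put() in page_requests_rw() pairs
with a get taken at queuing time rather than with the one in
zram_make_request():

	static bool __zram_make_async_request(struct zram *zram, struct bio *bio)
	{
		/* ... */
		bio_req = kmalloc(sizeof(*bio_req), GFP_NOIO);
		if (!bio_req)
			return false;

		/*
		 * Reference owned by this bio_req; the worker drops it when
		 * the last page_req of the bio completes.
		 */
		if (unlikely(!zram_meta_get(zram))) {
			kfree(bio_req);
			return false;
		}

		/* ... build page_list and run_worker() as in the patch ... */

		return true;
	fail:
		zram_meta_put(zram);
		kfree(bio_req);
		WARN_ON(!list_empty(&page_list));
		return false;
	}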
>
> > + }
> > + }
> > +}
> > +
> > +static int zram_thread(void *data)
> > +{
> > + DEFINE_WAIT(wait);
> > + LIST_HEAD(page_list);
> > +
> > + spin_lock(&workers.req_lock);
> > + workers.nr_running++;
> > +
> > + while (!kthread_should_stop()) {
> > + if (list_empty(&workers.req_list)) {
> > + prepare_to_wait_exclusive(&workers.req_wait, &wait,
> > + TASK_INTERRUPTIBLE);
> > + workers.nr_running--;
> > + spin_unlock(&workers.req_lock);
> > + schedule();
> > + spin_lock(&workers.req_lock);
> > + finish_wait(&workers.req_wait, &wait);
> > + workers.nr_running++;
> > + continue;
> > + }
> > +
> > + get_page_requests(&page_list);
> > + if (list_empty(&page_list))
> > + continue;
> > +
> > + spin_unlock(&workers.req_lock);
> > + page_requests_rw(&page_list);
> > + cond_resched();
> > + spin_lock(&workers.req_lock);
> > + }
> > +
> > + workers.nr_running--;
> > + spin_unlock(&workers.req_lock);
> > +
> > + return 0;
> > +}
> > +
> > +static void destroy_workers(void)
> > +{
> > + struct zram_worker *worker;
> > +
> > + while (!list_empty(&workers.worker_list)) {
> > + worker = list_first_entry(&workers.worker_list,
> > + struct zram_worker,
> > + list);
> > + kthread_stop(worker->task);
> > + list_del(&worker->list);
> > + kfree(worker);
> > + }
> > +
> > + WARN_ON(workers.nr_running);
> > +}
> > +
> > +static int create_workers(void)
> > +{
> > + int i;
> > + int nr_cpu = num_online_cpus();
> > + struct zram_worker *worker;
> > +
> > + INIT_LIST_HEAD(&workers.worker_list);
> > + INIT_LIST_HEAD(&workers.req_list);
> > + spin_lock_init(&workers.req_lock);
> > + init_waitqueue_head(&workers.req_wait);
> > +
> > + for (i = 0; i < nr_cpu; i++) {
> > + worker = kmalloc(sizeof(*worker), GFP_KERNEL);
> > + if (!worker)
> > + goto error;
> > +
> > + worker->task = kthread_run(zram_thread, NULL, "zramd-%d", i);
> > + if (IS_ERR(worker->task)) {
> > + kfree(worker);
> > + goto error;
> > + }
> > +
> > + list_add(&worker->list, &workers.worker_list);
> > + }
> > +
> > + return 0;
> > +
> > +error:
> > + destroy_workers();
> > + return 1;
> > +}
> > +
> > +static int zram_rw_async_page(struct zram *zram,
> > + struct bio_vec *bv, u32 index, int offset,
> > + bool is_write)
> > +{
> > + if (!zram->use_aio || !is_write)
> > + return 1;
> > +
> > + return queue_page_request(zram, bv, index, offset, is_write);
> > +}
> > +#else
> > +static bool __zram_make_async_request(struct zram *zram, struct bio *bio)
> > +{
> > + return false;
> > +}
> > +
> > +static int zram_rw_async_page(struct zram *zram,
> > + struct bio_vec *bv, u32 index, int offset,
> > + bool is_write)
> > +{
> > + return 1;
> > +}
> > +
> > +static int create_workers(void)
> > +{
> > + return 0;
> > +}
> > +
> > +static void destroy_workers(void)
> > +{
> > +}
> > +#endif
> > +
> > /*
> > * Handler function for all zram I/O requests.
> > */
> > @@ -954,6 +1380,9 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
> > goto out;
> > }
> >
> > + if (__zram_make_async_request(zram, bio))
> > + goto out;
> > +
> > __zram_make_sync_request(zram, bio);
> > out:
> > return BLK_QC_T_NONE;
> > @@ -1028,8 +1457,12 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
> > bv.bv_len = PAGE_SIZE;
> > bv.bv_offset = 0;
> >
> > - err = zram_rw_sync_page(bdev, zram, &bv, index, offset, is_write);
> > + err = zram_rw_async_page(zram, &bv, index, offset, is_write);
> > + if (!err)
> > + goto out;
> >
> > + err = zram_rw_sync_page(bdev, zram, &bv, index, offset, is_write);
> > +out:
> > return err;
> > }
> >
> > @@ -1066,7 +1499,6 @@ static void zram_reset_device(struct zram *zram)
> > /* Reset stats */
> > memset(&zram->stats, 0, sizeof(zram->stats));
> > zram->disksize = 0;
> > -
> > set_capacity(zram->disk, 0);
> > part_stat_set_all(&zram->disk->part0, 0);
> >
> > @@ -1211,6 +1643,9 @@ static DEVICE_ATTR_RW(mem_limit);
> > static DEVICE_ATTR_RW(mem_used_max);
> > static DEVICE_ATTR_RW(max_comp_streams);
> > static DEVICE_ATTR_RW(comp_algorithm);
> > +#ifdef CONFIG_ZRAM_ASYNC_IO
> > +static DEVICE_ATTR_RW(use_aio);
> > +#endif
>
> hm... no real objection, but exporing this sysfs attr can be very hacky
> and difficult for people...
We have been using sysfs to tune zram for a long time.
Please suggest ideas if you have a better one. :)
Thanks for reviewing, Sergey!