This article is the 7th day of Linux Advent Calendar 2019.
Apologies in advance that this is such a low-level topic — but low-level is what I do!
I usually write various low-level programs at a certain home appliance maker. Work is from no OS to Linux ...
I was a little worried, so I thought I would briefly summarize the implementation of NVDIMM.
NVDIMM stands for Non-Volatile DIMM — a memory module in the DIMM form factor whose contents survive power loss.
[NVDIMM recommendation](https://qiita.com/sodium334/items/7e3719ec08b3d3a3fbc7) is very easy to understand!
@YasunoriGoto1 has put together a wealth of reliable information about NVDIMMs. (If I had found it before I started writing, this article might have turned out quite differently...)
https://qiita.com/YasunoriGoto1/items/177c7a5b22a02d087ebf
Non-volatile memory (NVDIMM) and Linux support trends
This time, I will pick up and read this code only where it seems to be fun. Let's treat it as a block device, isn't it?
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/nvdimm?h=linux-5.3.y
The NVDIMM implementation of the linux kernel is scattered not only in root / drivers / nvdimm, but also in drivers / acpi / nfit. Let's throw away the faint expectation that "just read a little, you may understand!"
With that said, I would appreciate it if you could see it with a warm eye.
First, if you classify the structure based on the Makefile, it looks like this.
The procedure that the Linux kernel recognizes as a block device is as follows.
blk.c
module_init(nd_blk_init)
⇒ nd_driver_register(&nd_blk_driver);
⇒ nd_blk_driver = { .probe = nd_blk_probe, ... }
⇒ nsblk_attach_disk(nsblk)
/*
 * Excerpt of nsblk_attach_disk(): allocate a request queue, install
 * nd_blk_make_request as the bio entry point, configure queue limits
 * and logical block size, then size and register the gendisk.
 * (Declarations of dev/disk/available_disk_size are omitted in this
 * excerpt.)
 */
static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
struct request_queue *q;
q = blk_alloc_queue(GFP_KERNEL);
if (devm_add_action_or_reset(dev, nd_blk_release_queue, q))
return -ENOMEM; /* fixed: the article had "-ENOMEN", which is not a kernel errno */
blk_queue_make_request(q, nd_blk_make_request); // (1)
blk_queue_max_hw_sectors(q, UINT_MAX); // (2)
blk_queue_logical_block_size(q, nsblk_sector_size(nsblk));
q->queuedata = nsblk;
set_capacity(disk, available_disk_size >> SECTOR_SHIFT);
device_add_disk(dev, disk, NULL);
(1)nd_blk_make_request()
The contents of static blk_qc_t nd_blk_make_request (struct request_queue * q, struct bio * bio) looks like this
do_acct = nd_iostat_start(bio, &start);
bio_for_each_segment(bvec, bio, iter) {
err = nsblk_do_bvec(nsblk, bip, bvec.bv_page, len,
bvec.bv_offset, rw, iter.bi_sector);
if (do_acct)
nd_iostat_end(bio, start);
The contents of nd_iostat_start() and nd_iostat_end() aren't doing anything new (see https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/nvdimm/nd.h?h=linux-5.3.y), so I'll omit the explanation... The point here is clearly nsblk_do_bvec()!
/*
 * Transfer one bio segment to/from a BLK-mode namespace, in chunks.
 * Each chunk is kmapped and handed to the region's do_io() callback
 * (wired up from drivers/acpi/nfit — see below); integrity metadata,
 * if present on the bio, is handled per chunk as well.
 * NOTE(review): this is an abridged excerpt — declarations of iobuf,
 * err, dev_offset, lba, sector_size, ndbr and the computation of
 * cur_len are omitted in the article.
 */
static int nsblk_do_bvec(struct nd_namespace_blk *nsblk,
struct bio_integrity_payload *bip, struct page *page,
unsigned int len, unsigned int off, int rw, sector_t sector)
{
while (len) {
unsigned int cur_len;
/* map the data page and perform the actual device I/O for this chunk */
iobuf = kmap_atomic(page);
err = ndbr->do_io(ndbr, dev_offset, iobuf + off, cur_len, rw);
kunmap_atomic(iobuf);
/* if the bio carries an integrity payload, transfer its metadata too */
if (bip) {
err = nd_blk_rw_integrity(nsblk, bip, lba, rw);
if (err)
return err;
}
/* advance buffer offset and sector position to the next chunk */
len -= cur_len;
off += cur_len;
sector += sector_size >> SECTOR_SHIFT;
}
return err;
}
Where is this do_io from?
nd_region_create()
⇒ to_blk_region_desc(ndr_desc);
⇒ ndbr->do_io = ndbr_desc->do_io;
It's connected in drivers / acpi / nfit / core.c ... After doing that, the Write / Read command for blk was finally replaced by a memory copy! Now you have a way to access it as a block device.
acpi_desc->blk_do_io = acpi_nfit_blk_region_do_io;
ndbr_desc->enable = acpi_nfit_blk_region_enable;
ndbr_desc->do_io = acpi_desc->blk_do_io;
/*
 * The do_io() callback installed by drivers/acpi/nfit/core.c.
 * Acquires a "lane" (a token serializing access to the BLK aperture —
 * see nd_region_acquire_lane), then loops issuing at most mmio->size
 * bytes per call to acpi_nfit_blk_single_io() until the request is
 * complete or an error occurs.
 * NOTE(review): abridged excerpt — declarations of lane, copied and rc
 * are omitted in the article.
 */
static int acpi_nfit_blk_region_do_io(struct nd_blk_region *ndbr,
resource_size_t dpa, void *iobuf, u64 len, int rw)
{
lane = nd_region_acquire_lane(nd_region);
while (len) {
/* each pass moves at most one aperture's worth of data */
u64 c = min(len, mmio->size);
rc = acpi_nfit_blk_single_io(nfit_blk, dpa + copied,
iobuf + copied, c, rw, lane);
if (rc)
break;
copied += c;
len -= c;
}
nd_region_release_lane(nd_region, lane);
return rc;
}
/*
 * One aperture-sized transfer: program the BLK control register with
 * the DPA/length/direction, then copy through the aperture window.
 * Writes use memcpy_flushcache() so the data reaches the persistence
 * domain; reads may need an explicit cache invalidate first if the
 * DIMM advertises NFIT_BLK_READ_FLUSH.  After a write the region is
 * flushed, and the per-lane status register decides success vs -EIO.
 */
static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
resource_size_t dpa, void *iobuf, size_t len, int rw,
unsigned int lane)
{
write_blk_ctl(nfit_blk, lane, dpa, len, rw);
while (len) {
<Omitted>
if (rw)
/* write path: flush-on-copy so stores hit persistent media */
memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c);
else {
/* read path: invalidate stale CPU cachelines first if required */
if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH)
arch_invalidate_pmem((void __force *)
mmio->addr.aperture + offset, c);
memcpy(iobuf + copied, mmio->addr.aperture + offset, c);
}
copied += c;
len -= c;
}
if (rw)
nvdimm_flush(nfit_blk->nd_region, NULL);
/* the DIMM reports per-lane transfer status; non-zero means failure */
rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
return rc;
}
It says blk_queue_max_hw_sectors(q, UINT_MAX); — I suppose that since NVDIMM BLK namespaces have no particular hardware transfer unit, there is effectively no upper limit on the number of sectors per request.
blk_queue_logical_block_size (q, nsblk_sector_size (nsblk));
should get the capacity and set the size ...
/*
 * Usable (data) sector size exposed by a BLK namespace: the on-label
 * LBA size minus whatever per-sector metadata the namespace carries.
 */
static u32 nsblk_sector_size(struct nd_namespace_blk *nsblk)
{
	u32 meta = nsblk_meta_size(nsblk);

	return nsblk->lbasize - meta;
}
nsblk->lbasize = __le64_to_cpu(nd_label->lbasize);
nd_label->lbasize = __cpu_to_le64(nspm->lbasize);
nspm->lbasize = __le64_to_cpu(label0->lbasize);
??? I couldn't follow this chain of lbasize assignments... I'll pass on it!
that? block_device_operations is ...
blk.c
/*
 * Raw BLK-mode block_device_operations: note there are no read/write
 * entry points here — I/O flows through the queue's make_request
 * function (nd_blk_make_request, installed in nsblk_attach_disk).
 */
static const struct block_device_operations nd_blk_fops = {
.owner = THIS_MODULE,
.revalidate_disk = nvdimm_revalidate_disk,
};
Can't access this? So, let's read a little more.
blk.c
/*
 * Driver probe for a BLK namespace device.  Resolves the common
 * namespace, records its capacity, installs nsblk_rw_bytes as the
 * byte-level accessor, then decides how to surface the device:
 * as a BTT-backed disk, or as a raw BLK disk.
 */
static int nd_blk_probe(struct device *dev)
{
struct nd_namespace_common *ndns;
struct nd_namespace_blk *nsblk;
ndns = nvdimm_namespace_common_probe(dev);
if (IS_ERR(ndns))
return PTR_ERR(ndns);
nsblk = to_nd_namespace_blk(&ndns->dev);
nsblk->size = nvdimm_namespace_capacity(ndns);
dev_set_drvdata(dev, nsblk);
/* byte-granular access path used later by the BTT code */
ndns->rw_bytes = nsblk_rw_bytes;
if (is_nd_btt(dev))
/* device is a claimed BTT instance: attach via the BTT layer */
return nvdimm_namespace_attach_btt(ndns);
else if (nd_btt_probe(dev, ndns) == 0) {
/* we'll come back as btt-blk */
return -ENXIO;
} else
/* no BTT: expose the namespace as a raw BLK disk */
return nsblk_attach_disk(nsblk);
}
btt.c
/*
 * Attach a BTT (Block Translation Table) instance on top of a claimed
 * namespace: locate the parent region, initialize the btt structure
 * (which also creates the gendisk — see btt_init/btt_blk_init), and
 * link it to the nd_btt device.  Returns 0 or -ENOMEM.
 */
int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
{
struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
struct nd_region *nd_region;
struct btt_sb *btt_sb;
struct btt *btt;
size_t rawsize;
<Omitted>
nd_region = to_nd_region(nd_btt->dev.parent);
btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid,
nd_region);
if (!btt)
return -ENOMEM;
nd_btt->btt = btt;
return 0;
}
/*
 * Initialize a BTT instance.  The omitted portion allocates the btt
 * and discovers/creates its arenas; the visible tail creates the
 * block device (btt_blk_init) and the debugfs entries.  Returns the
 * new btt, or NULL on failure.
 */
static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
u32 lbasize, u8 *uuid, struct nd_region *nd_region)
{
<Omitted>
ret = btt_blk_init(btt);
if (ret) {
dev_err(dev, "init: error in blk_init: %d\n", ret);
return NULL;
}
btt_debugfs_init(btt);
return btt;
}
/*
 * BTT block_device_operations: unlike the raw BLK fops above, this one
 * provides rw_page (fast single-page I/O path) and getgeo.
 */
static const struct block_device_operations btt_fops = {
.owner = THIS_MODULE,
.rw_page = btt_rw_page,
.getgeo = btt_getgeo,
.revalidate_disk = nvdimm_revalidate_disk,
};
/*
 * Create and configure the gendisk for a BTT instance: derive the disk
 * name from the namespace, wire up btt_fops and the request queue, and
 * mark the backing device as capable of synchronous I/O (no writeback
 * needed — it is memory, after all).
 */
static int btt_blk_init(struct btt *btt)
{
struct nd_btt *nd_btt = btt->nd_btt;
struct nd_namespace_common *ndns = nd_btt->ndns;
<Omitted>
nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
btt->btt_disk->first_minor = 0;
btt->btt_disk->fops = &btt_fops;
btt->btt_disk->private_data = btt;
btt->btt_disk->queue = btt->btt_queue;
btt->btt_disk->flags = GENHD_FL_EXT_DEVT;
btt->btt_disk->queue->backing_dev_info->capabilities |=
BDI_CAP_SYNCHRONOUS_IO;
<Omitted>
}
btt.c
/*
 * rw_page entry point: transfer a single page synchronously via
 * btt_do_bvec (no bio, hence bip == NULL), and complete the page
 * on success with page_endio().
 */
static int btt_rw_page(struct block_device *bdev, sector_t sector,
struct page *page, unsigned int op)
{
struct btt *btt = bdev->bd_disk->private_data;
int rc;
unsigned int len;
/* huge pages span multiple PAGE_SIZE units */
len = hpage_nr_pages(page) * PAGE_SIZE;
rc = btt_do_bvec(btt, NULL, page, len, 0, op, sector);
if (rc == 0)
page_endio(page, op_is_write(op), 0);
return rc;
}
/*
 * Dispatch one segment to the BTT read or write path, with dcache
 * maintenance around the copy: after filling the page on a read, and
 * before consuming it on a write.
 */
static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
struct page *page, unsigned int len, unsigned int off,
unsigned int op, sector_t sector)
{
int ret;
if (!op_is_write(op)) {
ret = btt_read_pg(btt, bip, page, off, sector, len);
flush_dcache_page(page);
} else {
flush_dcache_page(page);
ret = btt_write_pg(btt, bip, sector, page, off, len);
}
return ret;
}
/*
 * BTT read path: for each chunk, look up the premap->postmap LBA
 * translation in the arena's map (btt_map_read) before reading data.
 * NOTE(review): heavily abridged excerpt — the arena/lane selection,
 * the data read itself, and the function's closing brace are omitted
 * in the article.
 */
static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
struct page *page, unsigned int off, sector_t sector,
unsigned int len)
{
int ret = 0;
int t_flag, e_flag;
struct arena_info *arena = NULL;
u32 lane = 0, premap, postmap;
while (len) {
<Omitted>
/* translate the premap LBA and fetch its trim/error flags */
ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag,
NVDIMM_IO_ATOMIC);
if (ret)
goto out_lane;
<Omitted>
}
/*
 * Read one map entry (the LBA translation plus trim/error flags) from
 * the arena's on-media map via arena_read_bytes().
 */
static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
int *trim, int *error, unsigned long rwb_flags)
{
<Omitted>
ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags);
<Omitted>
}
/*
 * Read @n bytes of arena metadata/data: rebase the arena-relative
 * @offset onto the namespace and delegate to nvdimm_read_bytes().
 */
static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
void *buf, size_t n, unsigned long flags)
{
	struct nd_btt *nd_btt = arena->nd_btt;

	/* arena offsets may be shifted from the base of the device */
	return nvdimm_read_bytes(nd_btt->ndns,
			adjust_initial_offset(nd_btt, offset), buf, n, flags);
}
include/nd.h
/**
* nvdimm_read_bytes() - synchronously read bytes from an nvdimm namespace
* @ndns: device to read
* @offset: namespace-relative starting offset
* @buf: buffer to fill
* @size: transfer length
* @flags: I/O flags forwarded to the namespace's rw_bytes() op
*         (e.g. NVDIMM_IO_ATOMIC — see btt_read_pg above)
*
* @buf is up-to-date upon return from this routine.
*/
static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns,
resource_size_t offset, void *buf, size_t size,
unsigned long flags)
{
/* thin wrapper: forwards to the installed rw_bytes() with rw = READ */
return ndns->rw_bytes(ndns, offset, buf, size, READ, flags);
}
ndns-> rw_bytes = nsblk_rw_bytes;
So ...
blk.c
/*
 * Byte-granular accessor installed on BLK namespaces (see nd_blk_probe:
 * ndns->rw_bytes = nsblk_rw_bytes).  Translates a namespace-relative
 * offset to a device (DIMM) offset, validates the request, and issues
 * it through the region's do_io() callback — the same path the raw
 * block device uses.
 */
static int nsblk_rw_bytes(struct nd_namespace_common *ndns,
resource_size_t offset, void *iobuf, size_t n, int rw,
unsigned long flags)
{
struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev);
struct nd_blk_region *ndbr = to_ndbr(nsblk);
resource_size_t dev_offset;
dev_offset = to_dev_offset(nsblk, offset, n);
/* reject requests that run past the end of the namespace */
if (unlikely(offset + n > nsblk->size)) {
dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
return -EFAULT;
}
/* SIZE_MAX is to_dev_offset()'s failed-translation sentinel */
if (dev_offset == SIZE_MAX)
return -EIO;
return ndbr->do_io(ndbr, dev_offset, iobuf, n, rw);
}
The rest is the same as the path to memory access described earlier.
The NVDIMM implementation of the linux kernel is scattered not only in root / drivers / nvdimm, but also in drivers / acpi / nfit. Let's throw away the faint expectation that "just read a little, you may understand!"
Recommended Posts